diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py index 64f0ba3..5915a79 100644 --- a/conversion_utils/conllu_to_tei.py +++ b/conversion_utils/conllu_to_tei.py @@ -12,7 +12,7 @@ class Sentence: self.no_ud = no_ud def add_item(self, token, lemma, upos, upos_other, xpos, misc): - self.items.append([token, lemma, upos, upos_other, xpos, misc == "SpaceAfter=No"]) + self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')]) def add_link(self, link_ref, link_type): self.links.append([link_ref, link_type]) @@ -55,11 +55,11 @@ class Sentence: link_grp = etree.Element('linkGrp') link_grp.set('corresp', '#'+xml_id) link_grp.set('targFunc', 'head argument') - link_grp.set('type', 'JOS-SYN') + link_grp.set('type', system.upper() + '-SYN') for link_id, item in enumerate(self.links): link_ref, link_type = item link = etree.Element('link') - link.set('ana', 'jos-syn:' + link_type) + link.set('ana', system + '-syn:' + link_type.replace(':','_')) if link_ref == u'0': link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1)) else: @@ -71,7 +71,7 @@ class Sentence: class Paragraph: def __init__(self, _id): - self._id = _id + self._id = _id if _id is not None else 'no-id' self.sentences = [] def add_sentence(self, sentence): @@ -265,6 +265,8 @@ def convert_file(input_file_name, output_file_name): tree.write(output_file_name, pretty_print=True, encoding='utf-8') +system = 'jos' # default (TODO: make this cleaner) + if __name__ == '__main__': import argparse from glob import glob @@ -273,6 +275,7 @@ if __name__ == '__main__': parser.add_argument('files', nargs='+', help='CoNNL-U file') parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.') + parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud']) args = parser.parse_args() @@ -281,6 +284,8 @@ if __name__ == '__main__': else: f_out = sys.stdout + system = args.system + for arg in args.files: filelist = glob(arg) for f in filelist: