From e636be1dc238039052ccc33e95711a84adc912b8 Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 3 Nov 2022 09:31:41 +0000 Subject: [PATCH] Added NER + SRL to conllu_to_tei script --- .gitignore | 2 ++ conversion_utils/conllu_to_tei.py | 55 +++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 854a5a5..c6aa33f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.pyc venv +data +.idea diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py index 6721c10..acd3520 100644 --- a/conversion_utils/conllu_to_tei.py +++ b/conversion_utils/conllu_to_tei.py @@ -9,15 +9,21 @@ class Sentence: self._id = _id self.items = [] self.links = [] + self.srl_links = [] self.no_ud = no_ud self.system = system def add_item(self, token, lemma, upos, upos_other, xpos, misc): - self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')]) + no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No' + ner = misc['NER'] if 'NER' in misc else 'O' + self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner]) def add_link(self, link_ref, link_type): self.links.append([link_ref, link_type]) + def add_srl_link(self, link_ref, link_type): + self.srl_links.append([link_ref, link_type]) + def as_xml(self, id_prefix=None): if id_prefix: xml_id = id_prefix + '.' + self._id @@ -27,8 +33,24 @@ class Sentence: set_xml_attr(base, 'id', xml_id) id_counter = 1 + in_seg = False + sentence_base = base + for item in self.items: - token, lemma, upos, upos_other, xpos, no_space_after = item + token, lemma, upos, upos_other, xpos, no_space_after, ner = item + + if ner[0] == 'B': + if in_seg: + sentence_base.append(base) + in_seg = True + base = etree.Element('seg') + base.set('type', 'name') + base.set('subtype', f'{ner[2:].lower()}') + elif ner[0] == 'O': + if in_seg: + sentence_base.append(base) + base = sentence_base + in_seg = False if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one to_add = etree.Element('pc') @@ -53,6 +75,11 @@ class Sentence: base.append(to_add) + if in_seg: + sentence_base.append(base) + base = sentence_base + + # depparsing linkGrp link_grp = etree.Element('linkGrp') link_grp.set('corresp', '#'+xml_id) link_grp.set('targFunc', 'head argument') @@ -67,6 +94,23 @@ class Sentence: link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1)) link_grp.append(link) base.append(link_grp) + + # srl linkGrp + if self.srl_links: + link_grp = etree.Element('linkGrp') + link_grp.set('corresp', '#' + xml_id) + link_grp.set('targFunc', 'head argument') + link_grp.set('type', 'SRL') + for link_id, item in enumerate(self.srl_links): + link_ref, link_type = item + link = etree.Element('link') + link.set('ana', 'srl:' + link_type.replace(':', '_')) + if link_ref == u'0': + link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1)) + else: + link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1)) + link_grp.append(link) + base.append(link_grp) return base @@ -234,7 +278,7 @@ def construct_sentence(sent_id, lines): upos_other = tokens[5] depparse_link = tokens[6] depparse_link_name = tokens[7] - misc = tokens[9] + misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} sentence.add_item( token, @@ -247,6 +291,11 @@ def construct_sentence(sent_id, lines): sentence.add_link( depparse_link, depparse_link_name) + + if 'SRL' in misc: + sentence.add_srl_link( + depparse_link, + misc['SRL']) return sentence