Added NER + SRL to conllu_to_tei script

This commit is contained in:
Luka 2022-11-03 09:31:41 +00:00
parent 2f74dfcab8
commit e636be1dc2
2 changed files with 54 additions and 3 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
*.pyc *.pyc
venv venv
data
.idea

View File

@ -9,15 +9,21 @@ class Sentence:
self._id = _id self._id = _id
self.items = [] self.items = []
self.links = [] self.links = []
self.srl_links = []
self.no_ud = no_ud self.no_ud = no_ud
self.system = system self.system = system
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
    """Append one token record to ``self.items``.

    The stored record is the list
    ``[token, lemma, upos, upos_other, xpos, no_space_after, ner]``.

    Args:
        token, lemma, upos, upos_other, xpos: stored unchanged.
        misc: the MISC column, either as a mapping of key to value or as
            the raw ``key=value|key=value`` string. Only the
            ``SpaceAfter`` and ``NER`` keys are read.
    """
    if isinstance(misc, str):
        # Raw column text: split it here, because indexing a plain string
        # with misc['SpaceAfter'] would raise TypeError.
        misc = dict(el.partition('=')[::2] for el in misc.split('|'))

    no_space_after = misc.get('SpaceAfter') == 'No'
    # A missing *or empty* NER value is stored as 'O'; as_xml indexes
    # ner[0], which would fail on an empty string.
    ner = misc.get('NER') or 'O'
    self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
def add_link(self, link_ref, link_type):
    """Record one link for this sentence.

    Appends the pair ``[link_ref, link_type]`` to ``self.links``; both
    values are stored exactly as passed. construct_sentence calls this
    with the dependency-parse head reference and relation name.
    """
    pair = [link_ref, link_type]
    self.links.append(pair)
def add_srl_link(self, link_ref, link_type):
    """Record one SRL link for this sentence.

    Appends the pair ``[link_ref, link_type]`` to ``self.srl_links``,
    the list that as_xml serialises into the linkGrp of type 'SRL'.
    Both values are stored exactly as passed.
    """
    pair = [link_ref, link_type]
    self.srl_links.append(pair)
def as_xml(self, id_prefix=None): def as_xml(self, id_prefix=None):
if id_prefix: if id_prefix:
xml_id = id_prefix + '.' + self._id xml_id = id_prefix + '.' + self._id
@ -27,8 +33,24 @@ class Sentence:
set_xml_attr(base, 'id', xml_id) set_xml_attr(base, 'id', xml_id)
id_counter = 1 id_counter = 1
in_seg = False
sentence_base = base
for item in self.items: for item in self.items:
token, lemma, upos, upos_other, xpos, no_space_after = item token, lemma, upos, upos_other, xpos, no_space_after, ner = item
if ner[0] == 'B':
if in_seg:
sentence_base.append(base)
in_seg = True
base = etree.Element('seg')
base.set('type', 'name')
base.set('subtype', f'{ner[2:].lower()}')
elif ner[0] == 'O':
if in_seg:
sentence_base.append(base)
base = sentence_base
in_seg = False
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
to_add = etree.Element('pc') to_add = etree.Element('pc')
@ -53,6 +75,11 @@ class Sentence:
base.append(to_add) base.append(to_add)
if in_seg:
sentence_base.append(base)
base = sentence_base
# depparsing linkGrp
link_grp = etree.Element('linkGrp') link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#'+xml_id) link_grp.set('corresp', '#'+xml_id)
link_grp.set('targFunc', 'head argument') link_grp.set('targFunc', 'head argument')
@ -67,6 +94,23 @@ class Sentence:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1)) link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link) link_grp.append(link)
base.append(link_grp) base.append(link_grp)
# srl linkGrp
if self.srl_links:
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#' + xml_id)
link_grp.set('targFunc', 'head argument')
link_grp.set('type', 'SRL')
for link_id, item in enumerate(self.srl_links):
link_ref, link_type = item
link = etree.Element('link')
link.set('ana', 'srl:' + link_type.replace(':', '_'))
if link_ref == u'0':
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
else:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
return base return base
@ -234,7 +278,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens[5] upos_other = tokens[5]
depparse_link = tokens[6] depparse_link = tokens[6]
depparse_link_name = tokens[7] depparse_link_name = tokens[7]
misc = tokens[9] misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')}
sentence.add_item( sentence.add_item(
token, token,
@ -247,6 +291,11 @@ def construct_sentence(sent_id, lines):
sentence.add_link( sentence.add_link(
depparse_link, depparse_link,
depparse_link_name) depparse_link_name)
if 'SRL' in misc:
sentence.add_srl_link(
depparse_link,
misc['SRL'])
return sentence return sentence