Adapted to new TEI version + Some other fixes

This commit is contained in:
Luka 2022-04-22 09:32:06 +02:00
parent 8489bf8264
commit 2e36fd0eaa
4 changed files with 307 additions and 246 deletions

View File

@ -134,23 +134,36 @@ def add_errors(i, error, source, target, edges):
for p_el_l5 in el_l5: for p_el_l5 in el_l5:
if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'): if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'):
word_combination_L5 += p_el_l5.text + " " word_combination_L5 += p_el_l5.text + " "
for p_el in el: # TODO NOT SURE IF THIS SHOULD BE COMMENTED! IF IT IS NOT THERE ARE ERRORS ON 2ND lvl of errors, where some words are duplicated
if p_el.tag.startswith('w') or p_el.tag.startswith('pc'): # for p_el in el:
ind = str(i) # if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
# ind = str(i)
target_id = "t" + ind #
target.append({"id": target_id, "text": p_el.text + " "}) # target_id = "t" + ind
target_edge_ids.append(target_id) # target.append({"id": target_id, "text": p_el.text + " "})
i += 1 # target_edge_ids.append(target_id)
# i += 1
if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None: if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None:
if label_L2 not in labels:
labels.append(label_L2) labels.append(label_L2)
else:
print(f"REPEATING LABEL - {label_L2} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None: if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None:
if label_L3 not in labels:
labels.append(label_L3) labels.append(label_L3)
else:
print(f"REPEATING LABEL - {label_L3} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None: if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None:
if label_L4 not in labels:
labels.append(label_L4) labels.append(label_L4)
else:
print(f"REPEATING LABEL - {label_L4} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None: if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None:
if label_L5 not in labels:
labels.append(label_L5) labels.append(label_L5)
else:
print(f"REPEATING LABEL - {label_L5} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
elif word_combination_L5 is not None: elif word_combination_L5 is not None:
has_error = True has_error = True
elif word_combination_L4 is not None: elif word_combination_L4 is not None:
@ -166,6 +179,29 @@ def add_errors(i, error, source, target, edges):
return i, has_error return i, has_error
def save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, dictionary, essay_problematic, dictionary_i):
    """Dump one paragraph's dictionary to disk as JSON.

    The file name is built from the paragraph's ``xml:id``; parts after the
    first (``dictionary_i != 1``) get a ``_P<dictionary_i>`` suffix.

    * ``paragraph_error`` falsy: ``<id>[_P<i>].json`` is written to both
      ``output_folder_loc`` and ``error_folder_loc``.
    * ``paragraph_error`` truthy: ``<id>[_P<i>]_problem.json`` is written to
      ``error_folder_loc`` only.

    Returns ``True`` when ``paragraph_error`` is truthy, otherwise the
    ``essay_problematic`` value that was passed in.
    """
    paragraph_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']
    part_suffix = '' if dictionary_i == 1 else '_P' + str(dictionary_i)

    if paragraph_error:
        essay_problematic = True
        folders = [error_folder_loc]
        file_name = paragraph_id + part_suffix + '_problem.json'
    else:
        folders = [output_folder_loc, error_folder_loc]
        file_name = paragraph_id + part_suffix + '.json'

    # makedirs(exist_ok=True) also creates missing parent folders and avoids
    # the check-then-create race of os.path.exists() + os.mkdir().
    for folder in folders:
        os.makedirs(folder, exist_ok=True)

    for folder in folders:
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # is pinned to UTF-8 instead of relying on the platform default.
        with open(os.path.join(folder, file_name), 'w', encoding='utf-8') as wf:
            json.dump(dictionary, wf, ensure_ascii=False, indent="")

    return essay_problematic
def process_file(et, args): def process_file(et, args):
if os.path.exists(args.output_folder): if os.path.exists(args.output_folder):
shutil.rmtree(args.output_folder) shutil.rmtree(args.output_folder)
@ -186,6 +222,9 @@ def process_file(et, args):
for paragraph in paragraphs: for paragraph in paragraphs:
sentences = paragraph.findall('s') sentences = paragraph.findall('s')
i = 1 i = 1
dictionary_i = 1
dictionary = []
source = [] source = []
target = [] target = []
edges = {} edges = {}
@ -203,23 +242,19 @@ def process_file(et, args):
if has_error: if has_error:
paragraph_error = True paragraph_error = True
dictionary = {"source": source, "target": target, "edges": edges} # add part of dictionary
if i > dictionary_i * 10000000000000:
essay_problematic = save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, {"source": source, "target": target, "edges": edges}, essay_problematic, dictionary_i)
# dictionary.append({"source": source, "target": target, "edges": edges})
dictionary_i += 1
source = []
target = []
edges = {}
paragraph_error = False
if not paragraph_error: # dictionary.append({"source": source, "target": target, "edges": edges})
if not os.path.exists(output_folder_loc):
os.mkdir(output_folder_loc) essay_problematic = save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, {"source": source, "target": target, "edges": edges}, essay_problematic, dictionary_i)
if not os.path.exists(error_folder_loc):
os.mkdir(error_folder_loc)
with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
else:
essay_problematic = True
if not os.path.exists(error_folder_loc):
os.mkdir(error_folder_loc)
with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
if not essay_problematic: if not essay_problematic:
shutil.rmtree(error_folder_loc) shutil.rmtree(error_folder_loc)

View File

@ -1,19 +1,25 @@
import argparse import argparse
import re import re
import sys import sys
from conversion_utils.jos_msds_and_properties import Converter, Msd
from conversion_utils.translate_conllu_jos import get_syn_map
from lxml import etree from lxml import etree
class Sentence: class Sentence:
def __init__(self, _id, no_ud=False): def __init__(self, _id, no_ud=False, is_source=None):
self._id = _id self._id = _id
self.items = [] self.items = []
self.links = [] self.links = []
self.no_ud = no_ud self.no_ud = no_ud
self.is_source = is_source
def add_item(self, word_id, token, lemma, upos, upos_other, xpos, misc): # JOS-SYN translations from English to Slovene
self.items.append([word_id, token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')]) self.syn_map = get_syn_map()
def add_item(self, word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner):
self.items.append([word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner])
def add_link(self, link_ref, link_type): def add_link(self, link_ref, link_type):
self.links.append([link_ref, link_type]) self.links.append([link_ref, link_type])
@ -26,36 +32,75 @@ class Sentence:
base = etree.Element('s') base = etree.Element('s')
set_xml_attr(base, 'id', xml_id) set_xml_attr(base, 'id', xml_id)
linkGrp = etree.Element(f'linkGrp')
linkGrp.attrib[f'corresp'] = f'#{xml_id}'
linkGrp.attrib[f'targFunc'] = 'head argument'
linkGrp.attrib[f'type'] = 'JOS-SYN'
ner_seg = None
for item in self.items: for item in self.items:
word_id, token, lemma, upos, upos_other, xpos, no_space_after = item word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner = item
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
to_add = etree.Element('pc') to_add = etree.Element('pc')
else: else:
to_add = etree.Element('w') to_add = etree.Element('w')
to_add.set('lemma', lemma)
to_add.set('ana', 'mte:' + xpos) to_add.set('ana', 'mte:' + xpos)
if not self.no_ud: if not self.no_ud:
if upos_other != '_': if upos_other != '_':
to_add.set('msd', f'UposTag={upos}|{upos_other}') to_add.set('msd', f'UPosTag={upos}|{upos_other}')
else: else:
to_add.set('msd', f'UposTag={upos}') to_add.set('msd', f'UPosTag={upos}')
set_xml_attr(to_add, 'id', word_id) if xpos not in {'U', 'Z'}:
to_add.set('lemma', lemma)
set_xml_attr(to_add, 'id', "{}.{}".format(xml_id, word_id))
to_add.text = token to_add.text = token
if no_space_after: if no_space_after:
to_add.set('join', 'right') to_add.set('join', 'right')
# handle ner subclass
if ner[0] == 'B':
if ner_seg is not None:
base.append(ner_seg)
del ner_seg
ner_seg = etree.Element('seg')
ner_seg.set('type', f'name')
ner_seg.set('subtype', f'{ner.split("-")[-1].lower()}')
elif ner[0] == 'O':
if ner_seg is not None:
base.append(ner_seg)
del ner_seg
ner_seg = None
if ner_seg is None:
base.append(to_add) base.append(to_add)
else:
ner_seg.append(to_add)
# handle links
link = etree.Element(f'link')
link.attrib['ana'] = f'jos-syn:{self.syn_map[deprel]}'
link.attrib['target'] = f'#{xml_id}.{head} #{xml_id}.{word_id}' if head != 0 else f'#{xml_id} #{xml_id}.{word_id}'
linkGrp.append(link)
if ner_seg is not None:
base.append(ner_seg)
base.append(linkGrp)
return base return base
class Paragraph: class Paragraph:
def __init__(self, _id, _doc_id): def __init__(self, _id, _doc_id, is_source):
self._id = _id if _id is not None else 'no-id' self._id = _id if _id is not None else 'no-id'
_doc_id += 's' if is_source else 't'
self._doc_id = _doc_id if _doc_id is not None else '' self._doc_id = _doc_id if _doc_id is not None else ''
self.sentences = [] self.sentences = []
@ -80,9 +125,9 @@ class Paragraph:
class TeiDocument: class TeiDocument:
def __init__(self, _id, paragraphs=list()): def __init__(self, _id, divs=list()):
self._id = _id self._id = _id
self.paragraphs = paragraphs self.divs = divs
def as_xml(self): def as_xml(self):
root = etree.Element('TEI') root = etree.Element('TEI')
@ -97,8 +142,13 @@ class TeiDocument:
text = etree.SubElement(root, 'text') text = etree.SubElement(root, 'text')
body = etree.SubElement(text, 'body') body = etree.SubElement(text, 'body')
for para in self.paragraphs: for paras, bibl in self.divs:
body.append(para.as_xml()) div = etree.Element('div')
set_xml_attr(div, 'id', xml_id)
div.append(bibl)
for para in paras:
div.append(para.as_xml())
body.append(div)
encoding_desc = etree.SubElement(tei_header, 'encodingDesc') encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
tags_decl = etree.SubElement(encoding_desc, 'tagsDecl') tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
@ -115,56 +165,90 @@ class TeiDocument:
self.paragraphs.append(paragraph) self.paragraphs.append(paragraph)
def convert_bibl(bibl):
    """Build a fresh ``bibl`` element mirroring the given one.

    The ``corresp`` and ``n`` attributes of ``bibl`` are copied onto the new
    element. Each direct child is recreated with the same tag, text and
    attributes; attributes whose name contains the XML namespace URI are
    written through ``set_xml_attr`` using the local name only. Nested
    grandchildren and tail text are not copied.
    """
    xml_ns = '{http://www.w3.org/XML/1998/namespace}'

    converted = etree.Element('bibl')
    for attr_name in ('corresp', 'n'):
        converted.set(attr_name, bibl.get(attr_name))

    for child in bibl:
        child_copy = etree.Element(child.tag)
        child_copy.text = child.text
        for name, value in child.attrib.items():
            if xml_ns in name:
                # strip the namespace part and let the helper re-attach it
                set_xml_attr(child_copy, name.split(xml_ns)[-1], value)
            else:
                child_copy.set(name, value)
        converted.append(child_copy)

    return converted
def build_tei_etrees(documents): def build_tei_etrees(documents):
elements = [] elements = []
for document in documents: for document in documents:
elements.append(document.as_xml()) elements.append(document.as_xml())
# b = elements[-1]
# a = list(b)
# c = list(b)[0]
# d = list(b)[1]
# for e in d:
# for f in e:
# for g in f:
# print(g)
# d = list(b)[1]
return elements return elements
def build_complete_tei(etree_source, etree_target, etree_links): def build_complete_tei(etree_source, etree_target, etree_links):
root = etree.Element('text') root = etree.Element('TEI')
root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
tei_header = etree.Element('teiHeader')
text = etree.Element('text')
group = etree.Element('group') group = etree.Element('group')
group.append(list(etree_source[0])[1]) group.append(list(etree_source[0])[1])
group.append(list(etree_target[0])[1]) group.append(list(etree_target[0])[1])
# link_text = etree.Element('text') text.append(group)
# link_body = etree.Element('body') root.append(tei_header)
# link_body.append(etree_links) root.append(text)
# link_text.append(link_body) # standoff = etree.Element('standOff')
group.append(etree_links) # standoff.append(etree_links)
root.append(group) # root.append(standoff)
root.append(etree_links)
return root return root
def build_links(all_edges): def build_links(all_edges):
root = etree.Element('text') # root = etree.Element('text')
body = etree.Element('body') # body = etree.Element('body')
body = etree.Element('standOff')
# root.set('xmlns', 'http://www.tei-c.org/ns/1.0') # root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
# set_xml_attr(root, 'lang', 'sl') # set_xml_attr(root, 'lang', 'sl')
# elements = [] # elements = []
for document_edges in all_edges: for document_edges in all_edges:
d = etree.Element('linkGrp') # d = etree.Element('linkGrp')
for paragraph_edges in document_edges: for paragraph_edges in document_edges:
p = etree.Element('linkGrp') # p = etree.Element('linkGrp')
for sentence_edges in paragraph_edges: for sentence_edges in paragraph_edges:
s = etree.Element('linkGrp') s = etree.Element('linkGrp')
random_id = '' random_id = ''
for token_edges in sentence_edges: for token_edges in sentence_edges:
link = etree.Element('link')
link.set('labels', ' '.join(token_edges['labels']))
link.set('sources', ' '.join(['#' + source for source in token_edges['source_ids']]))
link.set('targets', ' '.join(['#' + source for source in token_edges['target_ids']]))
if not random_id: if not random_id:
random_id = token_edges['source_ids'][0] if len(token_edges['source_ids']) > 0 else token_edges['target_ids'][0] random_id = token_edges['source_ids'][0] if len(token_edges['source_ids']) > 0 else token_edges['target_ids'][0]
sentence_id = '.'.join(random_id.split('.')[:3])
link = etree.Element('link')
labels = '|'.join(token_edges['labels']) if len(token_edges['labels']) > 0 else 'ID'
link.set('type', labels)
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
# link.set('target', ' '.join(['#' + source for source in token_edges['target_ids']]))
s.append(link) s.append(link)
set_xml_attr(s, 'sentence_id', '.'.join(random_id.split('.')[:3])) s.set('type', 'CORR')
p.append(s) s.set('targFunc', 'orig reg')
set_xml_attr(p, 'paragraph_id', '.'.join(random_id.split('.')[:2])) s.set('corresp', f'#{sentence_id}')
d.append(p) # body.append(s)
set_xml_attr(d, 'document_id', random_id.split('.')[0]) body.append(s)
body.append(d) # root.append(body)
root.append(body) return body
return root
def set_xml_attr(node, attribute, value): def set_xml_attr(node, attribute, value):
@ -187,90 +271,8 @@ def is_metaline(line):
return False return False
def construct_tei_documents_from_list(object_list): def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id):
documents = [] para = Paragraph(para_id, doc_id, source_id)
doc_id = None
document_paragraphs = []
para_id = None
# para_buffer = []
# for line in object_list:
# if is_metaline(line):
# key, val = parse_metaline(line)
# if key == 'newdoc id':
# if len(para_buffer) > 0:
# document_paragraphs.append(construct_paragraph(para_id, para_buffer))
# if len(document_paragraphs) > 0:
# documents.append(
# TeiDocument(doc_id, document_paragraphs))
# document_paragraphs = []
# doc_id = val
# elif key == 'newpar id':
# if len(para_buffer) > 0:
# document_paragraphs.append(construct_paragraph(para_id, para_buffer))
# para_buffer = []
# para_id = val
# elif key == 'sent_id':
# para_buffer.append(line)
# else:
# if not line.isspace():
# para_buffer.append(line)
if len(object_list) > 0:
document_paragraphs.append(construct_paragraph(doc_id, para_id, object_list))
if len(document_paragraphs) > 0:
documents.append(
TeiDocument(doc_id, document_paragraphs))
return documents
def construct_tei_documents(conllu_lines):
documents = []
doc_id = None
document_paragraphs = []
para_id = None
para_buffer = []
for line in conllu_lines:
if is_metaline(line):
key, val = parse_metaline(line)
if key == 'newdoc id':
if len(para_buffer) > 0:
document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
if len(document_paragraphs) > 0:
documents.append(
TeiDocument(doc_id, document_paragraphs))
document_paragraphs = []
doc_id = val
elif key == 'newpar id':
if len(para_buffer) > 0:
document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
para_buffer = []
para_id = val
elif key == 'sent_id':
para_buffer.append(line)
else:
if not line.isspace():
para_buffer.append(line)
if len(para_buffer) > 0:
document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
if len(document_paragraphs) > 0:
documents.append(
TeiDocument(doc_id, document_paragraphs))
return documents
def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
para = Paragraph(para_id, doc_id)
for sentence in etree_source_sentences: for sentence in etree_source_sentences:
para.add_sentence(sentence) para.add_sentence(sentence)
@ -278,8 +280,8 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
return para return para
def construct_paragraph(doc_id, para_id, conllu_lines): def construct_paragraph(doc_id, para_id, conllu_lines, is_source):
para = Paragraph(para_id, doc_id) para = Paragraph(para_id, doc_id, is_source)
sent_id = None sent_id = None
sent_buffer = [] sent_buffer = []
@ -301,16 +303,20 @@ def construct_paragraph(doc_id, para_id, conllu_lines):
return para return para
def construct_sentence_from_list(sent_id, object_list): def construct_sentence_from_list(sent_id, object_list, is_source):
sentence = Sentence(sent_id, no_ud=True) sentence = Sentence(sent_id)
converter = Converter()
for tokens in object_list: for tokens in object_list:
word_id = tokens['id'] word_id = f"{tokens['id']}" if is_source else f"{tokens['id']}"
token = tokens['token'] token = tokens['form']
lemma = tokens['lemma'] lemma = tokens['lemma']
upos = '_' upos = tokens['upos']
xpos = tokens['ana'][4:] xpos = converter.properties_to_msd(converter.msd_to_properties(Msd(tokens['xpos'], 'en'), 'sl', lemma), 'sl').code
upos_other = '_' upos_other = '|'.join([f'{k}={v}' for k, v in tokens['feats'].items()]) if tokens['feats'] else '_'
misc = '_' if tokens['space_after'] else 'SpaceAfter=No' head = tokens['head']
deprel = tokens['deprel']
no_space_after = 'SpaceAfter' in tokens['misc'] and tokens['misc']["SpaceAfter"] == "No"
ner = tokens['misc']['NER']
sentence.add_item( sentence.add_item(
word_id, word_id,
@ -319,7 +325,11 @@ def construct_sentence_from_list(sent_id, object_list):
upos, upos,
upos_other, upos_other,
xpos, xpos,
misc) head,
deprel,
no_space_after,
ner
)
return sentence return sentence
@ -354,49 +364,3 @@ def construct_sentence(sent_id, lines):
depparse_link_name) depparse_link_name)
return sentence return sentence
def construct_tei_etrees(conllu_lines):
documents = construct_tei_documents(conllu_lines)
return build_tei_etrees(documents)
def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r')
root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
input_file.close()
tree = etree.ElementTree(root)
tree.write(output_file_name, pretty_print=True, encoding='utf-8')
system = 'jos' # default (TODO: make this cleaner)
if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None,
help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args()
if args.out:
f_out = open(args.out, 'w')
else:
f_out = sys.stdout
system = args.system
for arg in args.files:
filelist = glob(arg)
for f in filelist:
with open(f, 'r') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
f_out.write('')

View File

@ -5,11 +5,15 @@ import os
import shutil import shutil
import time import time
from xml.etree import ElementTree from xml.etree import ElementTree
from conllu import TokenList
import conllu
import classla
import copy
from lxml import etree from lxml import etree
from src.create_tei import construct_tei_etrees, construct_tei_documents_from_list, construct_sentence_from_list, \ from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
@ -18,17 +22,22 @@ def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data
source_id = "s" + svala_i source_id = "s" + svala_i
target_id = "t" + svala_i target_id = "t" + svala_i
edge_id = "e-" + source_id + "-" + target_id edge_id = "e-" + source_id + "-" + target_id
source_token_id = sentence_string_id + f'.s{source_i}' labels = svala_data['edges'][edge_id]['labels']
target_token_id = sentence_string_id + f'.t{target_i}' sentence_string_id_split = sentence_string_id.split('.')
source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{source_i}'
target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{source_i}'
token_tag = 'w' if el.tag.startswith('w') else 'pc' token_tag = 'w' if el.tag.startswith('w') else 'pc'
lemma = el.attrib['lemma'] if token_tag == 'w' else el.text lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False}) source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False}) target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False})
edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': svala_data['edges'][edge_id]['labels']}) edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source): def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source):
source_token_id = sentence_string_id + f'.s{out_list_i}' if is_source else sentence_string_id + f'.t{out_list_i}' sentence_string_id_split = sentence_string_id.split('.')
source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
token_tag = 'w' if el.tag.startswith('w') else 'pc' token_tag = 'w' if el.tag.startswith('w') else 'pc'
lemma = el.attrib['lemma'] if token_tag == 'w' else el.text lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False}) out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
@ -54,7 +63,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
source_i += 1 source_i += 1
svala_i += 1 svala_i += 1
elif el.tag.startswith('c'): elif el.tag.startswith('c') and len(source) > 0:
source[-1]['space_after'] = True source[-1]['space_after'] = True
elif el.tag.startswith('p'): elif el.tag.startswith('p'):
@ -70,7 +79,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
target_i += 1 target_i += 1
svala_i += 1 svala_i += 1
elif p_el.tag.startswith('c'): elif p_el.tag.startswith('c') and len(target) > 0:
target[-1]['space_after'] = True target[-1]['space_after'] = True
elif el.tag.startswith('u2'): elif el.tag.startswith('u2'):
@ -86,7 +95,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
source_i += 1 source_i += 1
svala_i += 1 svala_i += 1
elif el_l2.tag.startswith('c'): elif el_l2.tag.startswith('c') and len(source) > 0:
source[-1]['space_after'] = True source[-1]['space_after'] = True
elif el_l2.tag.startswith('u3'): elif el_l2.tag.startswith('u3'):
@ -102,7 +111,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
source_i += 1 source_i += 1
svala_i += 1 svala_i += 1
elif el_l3.tag.startswith('c'): elif el_l3.tag.startswith('c') and len(source) > 0:
source[-1]['space_after'] = True source[-1]['space_after'] = True
elif el_l3.tag.startswith('u4'): elif el_l3.tag.startswith('u4'):
@ -117,7 +126,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
source_i += 1 source_i += 1
svala_i += 1 svala_i += 1
elif el_l4.tag.startswith('c'): elif el_l4.tag.startswith('c') and len(source) > 0:
source[-1]['space_after'] = True source[-1]['space_after'] = True
elif el_l4.tag.startswith('u5'): elif el_l4.tag.startswith('u5'):
@ -132,22 +141,23 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
source_i += 1 source_i += 1
svala_i += 1 svala_i += 1
elif el_l5.tag.startswith('c'): elif el_l5.tag.startswith('c') and len(source) > 0:
source[-1]['space_after'] = True source[-1]['space_after'] = True
for p_el in el: # TODO NOT SURE IF THIS SHOULD BE COMMENTED! IF IT IS NOT THERE ARE ERRORS ON 2ND lvl of errors, where some words are duplicated
if p_el.tag.startswith('w') or p_el.tag.startswith('pc'): # for p_el in el:
ind = str(svala_i) # if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
# ind = str(svala_i)
target_id = "t" + ind #
target_edge_ids.append(target_id) # target_id = "t" + ind
# target_edge_ids.append(target_id)
add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False) #
# add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
target_i += 1 #
svala_i += 1 # target_i += 1
elif p_el.tag.startswith('c'): # svala_i += 1
target[-1]['space_after'] = True # elif p_el.tag.startswith('c') and len(target) > 0:
# target[-1]['space_after'] = True
edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids) edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
edge_id = "e-" + "-".join(edge_ids) edge_id = "e-" + "-".join(edge_ids)
@ -156,14 +166,36 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
return svala_i, source_i, target_i return svala_i, source_i, target_i
def create_conllu(interest_list, sentence_string_id):
    """Serialize a list of token dicts as a single CoNLL-U sentence.

    Each entry of ``interest_list`` must provide ``'token'`` (the word form)
    and ``'space_after'``. Only ``id`` (1-based position), ``form`` and
    ``misc`` are filled in; every other column is left as ``None``.
    ``misc`` is ``"SpaceAfter=No"`` for tokens whose ``space_after`` is
    falsy, except for the last token of the sentence, which never carries
    that marker. ``sentence_string_id`` is stored as the ``sent_id``
    metadata entry.

    Returns the string produced by ``TokenList.serialize()``.
    """
    tokens = [
        {
            "id": position,
            "form": token['token'],
            "lemma": None,
            "upos": None,
            "xpos": None,
            "feats": None,
            "head": None,
            "deprel": None,
            "deps": None,
            "misc": None if token['space_after'] else "SpaceAfter=No",
        }
        for position, token in enumerate(interest_list, start=1)
    ]

    # misc is only ever None or exactly "SpaceAfter=No" here, so dropping the
    # marker from the last token amounts to resetting its misc to None.
    if tokens:
        tokens[-1]['misc'] = None

    conllu_result = TokenList(tokens)
    conllu_result.metadata = {"sent_id": sentence_string_id}

    return conllu_result.serialize()
def process_file(et, args, nlp):
if os.path.exists(args.results_folder): if os.path.exists(args.results_folder):
shutil.rmtree(args.results_folder) shutil.rmtree(args.results_folder)
os.mkdir(args.results_folder) os.mkdir(args.results_folder)
etree_source_documents = [] etree_source_documents = []
etree_target_documents = [] etree_target_documents = []
etree_source_paragraphs = [] etree_source_divs = []
etree_target_paragraphs = [] etree_target_divs = []
complete_source_conllu = ''
complete_target_conllu = ''
document_edges = [] document_edges = []
for div in et.iter('div'): for div in et.iter('div'):
@ -179,6 +211,8 @@ def process_file(et, args):
svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)]
svala_dict = {e[0]: e[1] for e in svala_list} svala_dict = {e[0]: e[1] for e in svala_list}
etree_source_paragraphs = []
etree_target_paragraphs = []
paragraph_edges = [] paragraph_edges = []
paragraphs = div.findall('p') paragraphs = div.findall('p')
@ -226,26 +260,55 @@ def process_file(et, args):
target[-1]['space_after'] = True target[-1]['space_after'] = True
sentence_edges.append(edges) sentence_edges.append(edges)
if len(source) > 0:
source_conllu = create_conllu(source, sentence_string_id)
if len(target) > 0:
target_conllu = create_conllu(target, sentence_string_id)
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source)) if len(source) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target)) source_conllu_annotated = nlp(source_conllu).to_conll()
if len(target) > 0:
target_conllu_annotated = nlp(target_conllu).to_conll()
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences)) if len(source) > 0:
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences)) complete_source_conllu += source_conllu_annotated
complete_target_conllu += target_conllu_annotated
if len(source) > 0:
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
if len(target) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
if len(source) > 0:
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
if len(target) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
paragraph_edges.append(sentence_edges) paragraph_edges.append(sentence_edges)
etree_bibl = convert_bibl(bibl)
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl)))
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl)))
document_edges.append(paragraph_edges) document_edges.append(paragraph_edges)
etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], etree_source_paragraphs)) etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's', etree_source_divs))
etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], etree_target_paragraphs)) etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't', etree_target_divs))
etree_source = build_tei_etrees(etree_source_documents) etree_source = build_tei_etrees(etree_source_documents)
etree_target = build_tei_etrees(etree_target_documents) etree_target = build_tei_etrees(etree_target_documents)
# TODO FIX THIS
etree_links = build_links(document_edges) etree_links = build_links(document_edges)
complete_etree = build_complete_tei(etree_source, etree_target, etree_links) complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
sf.write(complete_source_conllu)
with open(os.path.join(args.results_folder, f"target.conllu"), 'w') as sf:
sf.write(complete_target_conllu)
with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf: with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode()) sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
@ -266,8 +329,9 @@ def process_file(et, args):
def main(args): def main(args):
with open(args.solar_file, 'r') as fp: with open(args.solar_file, 'r') as fp:
logging.info(args.solar_file) logging.info(args.solar_file)
nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False, tokenize_pretokenized="conllu", type='standard_jos')
et = ElementTree.XML(fp.read()) et = ElementTree.XML(fp.read())
process_file(et, args) process_file(et, args, nlp)
if __name__ == '__main__': if __name__ == '__main__':
@ -275,8 +339,6 @@ if __name__ == '__main__':
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.') description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
parser.add_argument('--solar_file', default='data/Solar2.0/solar2.xml', parser.add_argument('--solar_file', default='data/Solar2.0/solar2.xml',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--txt_file', default='data/txt/input',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--svala_folder', default='data/solar.svala.error.small', parser.add_argument('--svala_folder', default='data/solar.svala.error.small',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--results_folder', default='data/results/solar3.0', parser.add_argument('--results_folder', default='data/results/solar3.0',

View File