diff --git a/solar2svala.py b/solar2svala.py
index 97b28b7..702d4a4 100644
--- a/solar2svala.py
+++ b/solar2svala.py
@@ -134,23 +134,36 @@ def add_errors(i, error, source, target, edges):
                     for p_el_l5 in el_l5:
                         if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'):
                             word_combination_L5 += p_el_l5.text + " "
-    for p_el in el:
-        if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
-            ind = str(i)
-
-            target_id = "t" + ind
-            target.append({"id": target_id, "text": p_el.text + " "})
-            target_edge_ids.append(target_id)
-            i += 1
+    # TODO: Not sure whether this block should stay commented out. If it is not, errors appear on the 2nd level of errors, where some words are duplicated.
+    # for p_el in el:
+    #     if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
+    #         ind = str(i)
+    #
+    #         target_id = "t" + ind
+    #         target.append({"id": target_id, "text": p_el.text + " "})
+    #         target_edge_ids.append(target_id)
+    #         i += 1
 
     if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None:
-        labels.append(label_L2)
+        if label_L2 not in labels:
+            labels.append(label_L2)
+        else:
+            print(f"REPEATING LABEL - {label_L2} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
     if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None:
-        labels.append(label_L3)
+        if label_L3 not in labels:
+            labels.append(label_L3)
+        else:
+            print(f"REPEATING LABEL - {label_L3} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
    if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None:
-        labels.append(label_L4)
+        if label_L4 not in labels:
+            labels.append(label_L4)
+        else:
+            print(f"REPEATING LABEL - {label_L4} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
     if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None:
-        labels.append(label_L5)
+        if label_L5 not in labels:
+            labels.append(label_L5)
+        else:
+            print(f"REPEATING LABEL - {label_L5} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
     elif word_combination_L5 is not None:
         has_error = True
     elif word_combination_L4 is not None:
@@ -166,6 +179,29 @@ def add_errors(i, error, source, target, edges):
     return i, has_error
 
 
+def save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, dictionary, essay_problematic, dictionary_i):
+    if not paragraph_error:
+        if not os.path.exists(output_folder_loc):
+            os.mkdir(output_folder_loc)
+        if not os.path.exists(error_folder_loc):
+            os.mkdir(error_folder_loc)
+        file_name = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json' if dictionary_i == 1 else paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_P' + str(dictionary_i) + '.json'
+        with open(os.path.join(output_folder_loc, file_name), 'w') as wf:
+            json.dump(dictionary, wf, ensure_ascii=False, indent="")
+        with open(os.path.join(error_folder_loc, file_name), 'w') as wf:
+            json.dump(dictionary, wf, ensure_ascii=False, indent="")
+    else:
+        essay_problematic = True
+        if not os.path.exists(error_folder_loc):
+            os.mkdir(error_folder_loc)
+        file_name = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json' if dictionary_i == 1 else paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_P' + str(dictionary_i) + '_problem.json'
+        with open(os.path.join(error_folder_loc, file_name),
+                  'w') as wf:
+            json.dump(dictionary, wf, ensure_ascii=False, indent="")
+
+    return essay_problematic
+
+
 def process_file(et, args):
     if os.path.exists(args.output_folder):
         shutil.rmtree(args.output_folder)
@@ -186,6 +222,9 @@ def process_file(et, args):
     for paragraph in paragraphs:
         sentences = paragraph.findall('s')
         i = 1
+        dictionary_i = 1
+        dictionary = []
+
         source = []
         target = []
         edges = {}
@@ -203,23 +242,19 @@ def process_file(et, args):
             if has_error:
                 paragraph_error = True
 
-        dictionary = {"source": source, "target": target, "edges": edges}
-
-        if not paragraph_error:
-            if not os.path.exists(output_folder_loc):
-                os.mkdir(output_folder_loc)
-            if not os.path.exists(error_folder_loc):
-                os.mkdir(error_folder_loc)
-            with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
-                json.dump(dictionary, wf, ensure_ascii=False, indent="")
-            with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
-                json.dump(dictionary, wf, ensure_ascii=False, indent="")
-        else:
-            essay_problematic = True
-            if not os.path.exists(error_folder_loc):
-                os.mkdir(error_folder_loc)
-            with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json'), 'w') as wf:
-                json.dump(dictionary, wf, ensure_ascii=False, indent="")
+            # add part of dictionary
+            if i > dictionary_i * 10000000000000:
+                essay_problematic = save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, {"source": source, "target": target, "edges": edges}, essay_problematic, dictionary_i)
+                # dictionary.append({"source": source, "target": target, "edges": edges})
+                dictionary_i += 1
+                source = []
+                target = []
+                edges = {}
+                paragraph_error = False
+
+        # dictionary.append({"source": source, "target": target, "edges": edges})
+
+        essay_problematic = save_file(paragraph_error, output_folder_loc, error_folder_loc, paragraph, {"source": source, "target": target, "edges": edges}, essay_problematic, dictionary_i)
 
         if not essay_problematic:
             shutil.rmtree(error_folder_loc)
diff --git a/src/create_tei.py b/src/create_tei.py
index d184d48..e36d9e4 100644
--- a/src/create_tei.py
+++ b/src/create_tei.py
@@ -1,19 +1,25 @@
 import argparse
 import re
 import sys
+from conversion_utils.jos_msds_and_properties import Converter, Msd
+from conversion_utils.translate_conllu_jos import get_syn_map
 from lxml import etree
 
 
 class Sentence:
-    def __init__(self, _id, no_ud=False):
+    def __init__(self, _id, no_ud=False, is_source=None):
         self._id = _id
         self.items = []
         self.links = []
         self.no_ud = no_ud
+        self.is_source = is_source
 
-    def add_item(self, word_id, token, lemma, upos, upos_other, xpos, misc):
-        self.items.append([word_id, token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
+        # JOS-SYN translations from English to Slovene
+        self.syn_map = get_syn_map()
+
+    def add_item(self, word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner):
+        self.items.append([word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner])
 
     def add_link(self, link_ref, link_type):
         self.links.append([link_ref, link_type])
@@ -26,36 +32,75 @@ class Sentence:
         base = etree.Element('s')
         set_xml_attr(base, 'id', xml_id)
 
+        linkGrp = etree.Element(f'linkGrp')
+        linkGrp.attrib[f'corresp'] = f'#{xml_id}'
+        linkGrp.attrib[f'targFunc'] = 'head argument'
+        linkGrp.attrib[f'type'] = 'JOS-SYN'
+
+        ner_seg = None
+
         for item in self.items:
-            word_id, token, lemma, upos, upos_other, xpos, no_space_after = item
+            word_id, token, lemma, upos, upos_other, xpos, head, deprel, no_space_after, ner = item
 
             if xpos in {'U', 'Z'}:  # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
                 to_add = etree.Element('pc')
             else:
                 to_add = etree.Element('w')
-                to_add.set('lemma', lemma)
 
             to_add.set('ana', 'mte:' + xpos)
             if not self.no_ud:
                 if upos_other != '_':
-                    to_add.set('msd', f'UposTag={upos}|{upos_other}')
+                    to_add.set('msd', f'UPosTag={upos}|{upos_other}')
                 else:
-                    to_add.set('msd', f'UposTag={upos}')
+                    to_add.set('msd', f'UPosTag={upos}')
 
-            set_xml_attr(to_add, 'id', word_id)
+            if xpos not in {'U', 'Z'}:
+                to_add.set('lemma', lemma)
+
+            set_xml_attr(to_add, 'id', "{}.{}".format(xml_id, word_id))
             to_add.text = token
 
             if no_space_after:
                 to_add.set('join', 'right')
 
-            base.append(to_add)
+            # handle ner subclass
+            if ner[0] == 'B':
+                if ner_seg is not None:
+                    base.append(ner_seg)
+                    del ner_seg
+
+                ner_seg = etree.Element('seg')
+                ner_seg.set('type', f'name')
+                ner_seg.set('subtype', f'{ner.split("-")[-1].lower()}')
+            elif ner[0] == 'O':
+                if ner_seg is not None:
+                    base.append(ner_seg)
+                    del ner_seg
+                    ner_seg = None
+
+            if ner_seg is None:
+                base.append(to_add)
+            else:
+                ner_seg.append(to_add)
+
+            # handle links
+            link = etree.Element(f'link')
+            link.attrib['ana'] = f'jos-syn:{self.syn_map[deprel]}'
+            link.attrib['target'] = f'#{xml_id}.{head} #{xml_id}.{word_id}' if head != 0 else f'#{xml_id} #{xml_id}.{word_id}'
+            linkGrp.append(link)
+
+        if ner_seg is not None:
+            base.append(ner_seg)
+
+        base.append(linkGrp)
 
         return base
 
 
 class Paragraph:
-    def __init__(self, _id, _doc_id):
+    def __init__(self, _id, _doc_id, is_source):
         self._id = _id if _id is not None else 'no-id'
+        _doc_id += 's' if is_source else 't'
         self._doc_id = _doc_id if _doc_id is not None else ''
 
         self.sentences = []
 
@@ -80,9 +125,9 @@ class Paragraph:
 
 
 class TeiDocument:
-    def __init__(self, _id, paragraphs=list()):
+    def __init__(self, _id, divs=list()):
         self._id = _id
-        self.paragraphs = paragraphs
+        self.divs = divs
 
     def as_xml(self):
         root = etree.Element('TEI')
@@ -97,8 +142,13 @@ class TeiDocument:
         text = etree.SubElement(root, 'text')
         body = etree.SubElement(text, 'body')
 
-        for para in self.paragraphs:
-            body.append(para.as_xml())
+        for paras, bibl in self.divs:
+            div = etree.Element('div')
+            set_xml_attr(div, 'id', xml_id)
+            div.append(bibl)
+            for para in paras:
+                div.append(para.as_xml())
+            body.append(div)
 
         encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
         tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
@@ -115,56 +165,90 @@ class TeiDocument:
         self.paragraphs.append(paragraph)
 
 
+def convert_bibl(bibl):
+    etree_bibl = etree.Element('bibl')
+    etree_bibl.set('corresp', bibl.get('corresp'))
+    etree_bibl.set('n', bibl.get('n'))
+    for bibl_el in bibl:
+        etree_bibl_el = etree.Element(bibl_el.tag)
+        etree_bibl_el.text = bibl_el.text
+        for att, val in bibl_el.attrib.items():
+            if '{http://www.w3.org/XML/1998/namespace}' in att:
+                set_xml_attr(etree_bibl_el, att.split('{http://www.w3.org/XML/1998/namespace}')[-1], val)
+            else:
+                etree_bibl_el.set(att, val)
+        etree_bibl.append(etree_bibl_el)
+    return etree_bibl
+
+
 def build_tei_etrees(documents):
     elements = []
     for document in documents:
         elements.append(document.as_xml())
+    # b = elements[-1]
+    # a = list(b)
+    # c = list(b)[0]
+    # d = list(b)[1]
+    # for e in d:
+    #     for f in e:
+    #         for g in f:
+    #             print(g)
+    # d = list(b)[1]
     return elements
 
 
+
 def build_complete_tei(etree_source, etree_target, etree_links):
-    root = etree.Element('text')
+    root = etree.Element('TEI')
+    root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
+    tei_header = etree.Element('teiHeader')
+    text = etree.Element('text')
     group = etree.Element('group')
     group.append(list(etree_source[0])[1])
     group.append(list(etree_target[0])[1])
-    # link_text = etree.Element('text')
-    # link_body = etree.Element('body')
-    # link_body.append(etree_links)
-    # link_text.append(link_body)
-    group.append(etree_links)
-    root.append(group)
-
+    text.append(group)
+    root.append(tei_header)
+    root.append(text)
+    # standoff = etree.Element('standOff')
+    # standoff.append(etree_links)
+    # root.append(standoff)
+    root.append(etree_links)
     return root
 
 
+
 def build_links(all_edges):
-    root = etree.Element('text')
-    body = etree.Element('body')
+    # root = etree.Element('text')
+    # body = etree.Element('body')
+    body = etree.Element('standOff')
     # root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
     # set_xml_attr(root, 'lang', 'sl')
     # elements = []
     for document_edges in all_edges:
-        d = etree.Element('linkGrp')
+        # d = etree.Element('linkGrp')
         for paragraph_edges in document_edges:
-            p = etree.Element('linkGrp')
+            # p = etree.Element('linkGrp')
             for sentence_edges in paragraph_edges:
                 s = etree.Element('linkGrp')
+                random_id = ''
                 for token_edges in sentence_edges:
-                    link = etree.Element('link')
-                    link.set('labels', ' '.join(token_edges['labels']))
-                    link.set('sources', ' '.join(['#' + source for source in token_edges['source_ids']]))
-                    link.set('targets', ' '.join(['#' + source for source in token_edges['target_ids']]))
                     if not random_id:
                         random_id = token_edges['source_ids'][0] if len(token_edges['source_ids']) > 0 else token_edges['target_ids'][0]
+                        sentence_id = '.'.join(random_id.split('.')[:3])
+                    link = etree.Element('link')
+                    labels = '|'.join(token_edges['labels']) if len(token_edges['labels']) > 0 else 'ID'
+                    link.set('type', labels)
+                    link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
+                    # link.set('target', ' '.join(['#' + source for source in token_edges['target_ids']]))
+                    s.append(link)
-                set_xml_attr(s, 'sentence_id', '.'.join(random_id.split('.')[:3]))
-                p.append(s)
-            set_xml_attr(p, 'paragraph_id', '.'.join(random_id.split('.')[:2]))
-            d.append(p)
-        set_xml_attr(d, 'document_id', random_id.split('.')[0])
-        body.append(d)
-    root.append(body)
-    return root
+                s.set('type', 'CORR')
+                s.set('targFunc', 'orig reg')
+                s.set('corresp', f'#{sentence_id}')
+                # body.append(s)
+                body.append(s)
+    # root.append(body)
+    return body
 
 
 def set_xml_attr(node, attribute, value):
@@ -187,90 +271,8 @@ def is_metaline(line):
     return False
 
 
-def construct_tei_documents_from_list(object_list):
-    documents = []
-
-    doc_id = None
-    document_paragraphs = []
-
-    para_id = None
-    # para_buffer = []
-
-    # for line in object_list:
-    #     if is_metaline(line):
-    #         key, val = parse_metaline(line)
-    #         if key == 'newdoc id':
-    #             if len(para_buffer) > 0:
-    #                 document_paragraphs.append(construct_paragraph(para_id, para_buffer))
-    #             if len(document_paragraphs) > 0:
-    #                 documents.append(
-    #                     TeiDocument(doc_id, document_paragraphs))
-    #                 document_paragraphs = []
-    #             doc_id = val
-    #         elif key == 'newpar id':
-    #             if len(para_buffer) > 0:
-    #                 document_paragraphs.append(construct_paragraph(para_id, para_buffer))
-    #                 para_buffer = []
-    #             para_id = val
-    #         elif key == 'sent_id':
-    #             para_buffer.append(line)
-    #     else:
-    #         if not line.isspace():
-    #             para_buffer.append(line)
-
-    if len(object_list) > 0:
-        document_paragraphs.append(construct_paragraph(doc_id, para_id, object_list))
-
-    if len(document_paragraphs) > 0:
-        documents.append(
-            TeiDocument(doc_id, document_paragraphs))
-
-    return documents
-
-
-def construct_tei_documents(conllu_lines):
-    documents = []
-
-    doc_id = None
-    document_paragraphs = []
-
-    para_id = None
-    para_buffer = []
-
-    for line in conllu_lines:
-        if is_metaline(line):
-            key, val = parse_metaline(line)
-            if key == 'newdoc id':
-                if len(para_buffer) > 0:
-                    document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
-                if len(document_paragraphs) > 0:
-                    documents.append(
-                        TeiDocument(doc_id, document_paragraphs))
-                    document_paragraphs = []
-                doc_id = val
-            elif key == 'newpar id':
-                if len(para_buffer) > 0:
-                    document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
-                    para_buffer = []
-                para_id = val
-            elif key == 'sent_id':
-                para_buffer.append(line)
-        else:
-            if not line.isspace():
-                para_buffer.append(line)
-
-    if len(para_buffer) > 0:
-        document_paragraphs.append(construct_paragraph(doc_id, para_id, para_buffer))
-
-    if len(document_paragraphs) > 0:
-        documents.append(
-            TeiDocument(doc_id, document_paragraphs))
-
-    return documents
-
-
-def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
-    para = Paragraph(para_id, doc_id)
+def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id):
+    para = Paragraph(para_id, doc_id, source_id)
 
     for sentence in etree_source_sentences:
         para.add_sentence(sentence)
@@ -278,8 +280,8 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
     return para
 
 
-def construct_paragraph(doc_id, para_id, conllu_lines):
-    para = Paragraph(para_id, doc_id)
+def construct_paragraph(doc_id, para_id, conllu_lines, is_source):
+    para = Paragraph(para_id, doc_id, is_source)
 
     sent_id = None
     sent_buffer = []
@@ -301,16 +303,20 @@ def construct_paragraph(doc_id, para_id, conllu_lines):
     return para
 
 
-def construct_sentence_from_list(sent_id, object_list):
-    sentence = Sentence(sent_id, no_ud=True)
+def construct_sentence_from_list(sent_id, object_list, is_source):
+    sentence = Sentence(sent_id)
+    converter = Converter()
 
     for tokens in object_list:
-        word_id = tokens['id']
-        token = tokens['token']
+        word_id = f"{tokens['id']}" if is_source else f"{tokens['id']}"
+        token = tokens['form']
         lemma = tokens['lemma']
-        upos = '_'
-        xpos = tokens['ana'][4:]
-        upos_other = '_'
-        misc = '_' if tokens['space_after'] else 'SpaceAfter=No'
+        upos = tokens['upos']
+        xpos = converter.properties_to_msd(converter.msd_to_properties(Msd(tokens['xpos'], 'en'), 'sl', lemma), 'sl').code
+        upos_other = '|'.join([f'{k}={v}' for k, v in tokens['feats'].items()]) if tokens['feats'] else '_'
+        head = tokens['head']
+        deprel = tokens['deprel']
+        no_space_after = 'SpaceAfter' in tokens['misc'] and tokens['misc']["SpaceAfter"] == "No"
+        ner = tokens['misc']['NER']
 
         sentence.add_item(
             word_id,
@@ -319,7 +325,11 @@ def construct_sentence_from_list(sent_id, object_list):
             upos,
             upos_other,
             xpos,
-            misc)
+            head,
+            deprel,
+            no_space_after,
+            ner
+        )
 
     return sentence
 
@@ -354,49 +364,3 @@ def construct_sentence(sent_id, lines):
             depparse_link_name)
 
     return sentence
-
-def construct_tei_etrees(conllu_lines):
-    documents = construct_tei_documents(conllu_lines)
-    return build_tei_etrees(documents)
-
-
-def convert_file(input_file_name, output_file_name):
-    input_file = open(input_file_name, 'r')
-    root = construct_tei_etrees(input_file)[0]
-    tree = etree.ElementTree(root)
-    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
-    input_file.close()
-
-    tree = etree.ElementTree(root)
-    tree.write(output_file_name, pretty_print=True, encoding='utf-8')
-
-
-system = 'jos'  # default (TODO: make this cleaner)
-
-if __name__ == '__main__':
-    import argparse
-    from glob import glob
-
-    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
-    parser.add_argument('files', nargs='+', help='CoNNL-U file')
-    parser.add_argument('-o', '--out-file', dest='out', default=None,
-                        help='Write output to file instead of stdout.')
-    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
-
-    args = parser.parse_args()
-
-    if args.out:
-        f_out = open(args.out, 'w')
-    else:
-        f_out = sys.stdout
-
-    system = args.system
-
-    for arg in args.files:
-        filelist = glob(arg)
-        for f in filelist:
-            with open(f, 'r') as conllu_f:
-                tei_etrees = construct_tei_etrees(conllu_f)
-            for tei_etree in tei_etrees:
-                f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
-    f_out.write('')
diff --git a/svala2tei.py b/svala2tei.py
index 1c38d24..729f65d 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -5,11 +5,15 @@
 import os
 import shutil
 import time
 from xml.etree import ElementTree
+from conllu import TokenList
+import conllu
+import classla
+import copy
 
 from lxml import etree
 
-from src.create_tei import construct_tei_etrees, construct_tei_documents_from_list, construct_sentence_from_list, \
-    construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei
+from src.create_tei import construct_sentence_from_list, \
+    construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
 
 logging.basicConfig(level=logging.INFO)
@@ -18,17 +22,22 @@ def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data
     source_id = "s" + svala_i
     target_id = "t" + svala_i
     edge_id = "e-" + source_id + "-" + target_id
-    source_token_id = sentence_string_id + f'.s{source_i}'
-    target_token_id = sentence_string_id + f'.t{target_i}'
+    labels = svala_data['edges'][edge_id]['labels']
+    sentence_string_id_split = sentence_string_id.split('.')
+    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{source_i}'
+    target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{source_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
     source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
     target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False})
-    edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': svala_data['edges'][edge_id]['labels']})
+    edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
 
 
 def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source):
-    source_token_id = sentence_string_id + f'.s{out_list_i}' if is_source else sentence_string_id + f'.t{out_list_i}'
+    sentence_string_id_split = sentence_string_id.split('.')
+
+    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
+        else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
     out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
@@ -54,7 +63,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
             source_i += 1
             svala_i += 1
 
-        elif el.tag.startswith('c'):
+        elif el.tag.startswith('c') and len(source) > 0:
             source[-1]['space_after'] = True
 
         elif el.tag.startswith('p'):
@@ -70,7 +79,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
                     target_i += 1
                     svala_i += 1
 
-                elif p_el.tag.startswith('c'):
+                elif p_el.tag.startswith('c') and len(target) > 0:
                     target[-1]['space_after'] = True
 
         elif el.tag.startswith('u2'):
@@ -86,7 +95,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
                     source_i += 1
                     svala_i += 1
 
-                elif el_l2.tag.startswith('c'):
+                elif el_l2.tag.startswith('c') and len(source) > 0:
                     source[-1]['space_after'] = True
 
                 elif el_l2.tag.startswith('u3'):
@@ -102,7 +111,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
                             source_i += 1
                             svala_i += 1
 
-                        elif el_l3.tag.startswith('c'):
+                        elif el_l3.tag.startswith('c') and len(source) > 0:
                             source[-1]['space_after'] = True
 
                         elif el_l3.tag.startswith('u4'):
@@ -117,7 +126,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
                                     source_i += 1
                                     svala_i += 1
 
-                                elif el_l4.tag.startswith('c'):
+                                elif el_l4.tag.startswith('c') and len(source) > 0:
                                     source[-1]['space_after'] = True
 
                                 elif el_l4.tag.startswith('u5'):
@@ -132,22 +141,23 @@ def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_
                                             source_i += 1
                                             svala_i += 1
 
-                                        elif el_l5.tag.startswith('c'):
+                                        elif el_l5.tag.startswith('c') and len(source) > 0:
                                             source[-1]['space_after'] = True
 
-        for p_el in el:
-            if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
-                ind = str(svala_i)
-
-                target_id = "t" + ind
-                target_edge_ids.append(target_id)
-
-                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
-
-                target_i += 1
-                svala_i += 1
-            elif p_el.tag.startswith('c'):
-                target[-1]['space_after'] = True
+        # TODO: Not sure whether this block should stay commented out. If it is not, errors appear on the 2nd level of errors, where some words are duplicated.
+        # for p_el in el:
+        #     if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
+        #         ind = str(svala_i)
+        #
+        #         target_id = "t" + ind
+        #         target_edge_ids.append(target_id)
+        #
+        #         add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+        #
+        #         target_i += 1
+        #         svala_i += 1
+        #     elif p_el.tag.startswith('c') and len(target) > 0:
+        #         target[-1]['space_after'] = True
 
     edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
     edge_id = "e-" + "-".join(edge_ids)
@@ -156,14 +166,36 @@
     return svala_i, source_i, target_i
 
 
-def process_file(et, args):
+def create_conllu(interest_list, sentence_string_id):
+    conllu_result = TokenList([{"id": token_i + 1, "form": token['token'], "lemma": None, "upos": None, "xpos": None, "feats": None,
+                                "head": None, "deprel": None, "deps": None, "misc": "SpaceAfter=No"} if not token['space_after']
+                               else {"id": token_i + 1, "form": token['token'], "lemma": None, "upos": None, "xpos": None,
+                                     "feats": None, "head": None, "deprel": None, "deps": None, "misc": None} for token_i, token in
+                               enumerate(interest_list)])
+    # Delete last SpaceAfter
+    misc = conllu_result[len(conllu_result) - 1]['misc'] if len(conllu_result) > 0 else None
+    if misc is not None:
+        misc_split = misc.split('|')
+        if misc is not None and misc == 'SpaceAfter=No':
+            conllu_result[len(conllu_result) - 1]['misc'] = None
+        elif misc is not None and 'SpaceAfter=No' in misc_split:
+            conllu_result[len(conllu_result) - 1]['misc'] = '|'.join([el for el in misc_split if el != 'SpaceAfter=No'])
+    conllu_result.metadata = {"sent_id": sentence_string_id}
+
+    return conllu_result.serialize()
+
+
+def process_file(et, args, nlp):
     if os.path.exists(args.results_folder):
         shutil.rmtree(args.results_folder)
     os.mkdir(args.results_folder)
     etree_source_documents = []
     etree_target_documents = []
-    etree_source_paragraphs = []
-    etree_target_paragraphs = []
+    etree_source_divs = []
+    etree_target_divs = []
+
+    complete_source_conllu = ''
+    complete_target_conllu = ''
 
     document_edges = []
     for div in et.iter('div'):
@@ -179,6 +211,8 @@ def process_file(et, args):
         svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)]
         svala_dict = {e[0]: e[1] for e in svala_list}
 
+        etree_source_paragraphs = []
+        etree_target_paragraphs = []
         paragraph_edges = []
 
         paragraphs = div.findall('p')
@@ -226,26 +260,55 @@ def process_file(et, args):
                     target[-1]['space_after'] = True
 
                 sentence_edges.append(edges)
-
-                etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source))
-                etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target))
-
-            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences))
-            etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences))
+                if len(source) > 0:
+                    source_conllu = create_conllu(source, sentence_string_id)
+                if len(target) > 0:
+                    target_conllu = create_conllu(target, sentence_string_id)
+
+                if len(source) > 0:
+                    source_conllu_annotated = nlp(source_conllu).to_conll()
+                if len(target) > 0:
+                    target_conllu_annotated = nlp(target_conllu).to_conll()
+
+                if len(source) > 0:
+                    complete_source_conllu += source_conllu_annotated
+                    complete_target_conllu += target_conllu_annotated
+
+                if len(source) > 0:
+                    source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
+                if len(target) > 0:
+                    target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
+
+                if len(source) > 0:
+                    etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
+                if len(target) > 0:
+                    etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
+
+            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
+            etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
             paragraph_edges.append(sentence_edges)
 
+        etree_bibl = convert_bibl(bibl)
+
+        etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl)))
+        etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl)))
         document_edges.append(paragraph_edges)
 
-    etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], etree_source_paragraphs))
-    etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], etree_target_paragraphs))
+    etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's', etree_source_divs))
+    etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't', etree_target_divs))
 
     etree_source = build_tei_etrees(etree_source_documents)
     etree_target = build_tei_etrees(etree_target_documents)
 
-    # TODO FIX THIS
     etree_links = build_links(document_edges)
-    complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
+    complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
+
+    with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
+        sf.write(complete_source_conllu)
+
+    with open(os.path.join(args.results_folder, f"target.conllu"), 'w') as sf:
+        sf.write(complete_target_conllu)
 
     with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
         sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
@@ -266,8 +329,9 @@ def process_file(et, args):
 def main(args):
     with open(args.solar_file, 'r') as fp:
         logging.info(args.solar_file)
+        nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False, tokenize_pretokenized="conllu", type='standard_jos')
         et = ElementTree.XML(fp.read())
-        process_file(et, args)
+        process_file(et, args, nlp)
 
 
 if __name__ == '__main__':
@@ -275,8 +339,6 @@ if __name__ == '__main__':
         description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
     parser.add_argument('--solar_file', default='data/Solar2.0/solar2.xml',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--txt_file', default='data/txt/input',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--svala_folder', default='data/solar.svala.error.small',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--results_folder', default='data/results/solar3.0',
diff --git a/svala_formatter/__init__.py b/svala_formatter/__init__.py
new file mode 100644
index 0000000..e69de29