Added svala2tei code.

2022-02-22 10:35:01 +01:00 · 2022-02-22 10:35:01 +01:00 · ab292d74f0
commit ab292d74f0
parent 4b88ada956
5 changed files with 619 additions and 0 deletions
--- a/src/init.py
+++ b/src/init.py
--- a/src/pycache/init.cpython-37.pyc
+++ b/src/pycache/init.cpython-37.pyc
--- a/src/pycache/create_tei.cpython-37.pyc
+++ b/src/pycache/create_tei.cpython-37.pyc
--- a/src/create_tei.py
+++ b/src/create_tei.py
@ -0,0 +1,355 @@
+import argparse
+import re
+import sys
+
+from lxml import etree
+
+
+class Sentence:
+    """A sentence: an ordered list of token rows plus dependency links.
+
+    ``items`` rows are ``[token, lemma, upos, upos_other, xpos,
+    no_space_after]``; ``links`` rows are ``[link_ref, link_type]``.  Only
+    ``items`` is rendered by :meth:`as_xml`.
+    """
+
+    def __init__(self, _id, no_ud=False):
+        self._id = _id
+        self.no_ud = no_ud  # when true, as_xml() omits the ``msd`` attribute
+        self.items = []
+        self.links = []
+
+    def add_item(self, token, lemma, upos, upos_other, xpos, misc):
+        """Append a token row; ``misc`` is reduced to a "SpaceAfter=No" flag."""
+        glued_to_next = "SpaceAfter=No" in misc.split('|')
+        self.items.append([token, lemma, upos, upos_other, xpos, glued_to_next])
+
+    def add_link(self, link_ref, link_type):
+        """Record a ``[link_ref, link_type]`` pair."""
+        self.links.append([link_ref, link_type])
+
+    def as_xml(self, id_prefix=None):
+        """Return an ``<s>`` element with one ``<w>``/``<pc>`` child per item.
+
+        The sentence ``xml:id`` is ``<id_prefix>.<_id>`` when a truthy
+        ``id_prefix`` is given, otherwise ``_id``; children are numbered
+        ``<sentence id>.1``, ``.2``, ... in item order.
+        """
+        xml_id = (id_prefix + '.' + self._id) if id_prefix else self._id
+        base = etree.Element('s')
+        set_xml_attr(base, 'id', xml_id)
+
+        for position, row in enumerate(self.items, start=1):
+            token, lemma, upos, upos_other, xpos, no_space_after = row
+
+            # 'U' and 'Z' are treated as punctuation tags; this is only safe
+            # while U is unused in the English tagset and Z in the Slovenian.
+            is_punctuation = xpos in {'U', 'Z'}
+            node = etree.Element('pc' if is_punctuation else 'w')
+            if not is_punctuation:
+                node.set('lemma', lemma)
+            node.set('ana', 'mte:' + xpos)
+
+            if not self.no_ud:
+                msd = f'UposTag={upos}'
+                if upos_other != '_':
+                    msd = f'UposTag={upos}|{upos_other}'
+                node.set('msd', msd)
+
+            set_xml_attr(node, 'id', "{}.{}".format(xml_id, position))
+            node.text = token
+            if no_space_after:
+                node.set('join', 'right')
+
+            base.append(node)
+
+        return base
+
+
+class Paragraph:
+    """A paragraph: an id plus the sentences it contains, in order."""
+
+    def __init__(self, _id):
+        # A missing id is replaced by the literal placeholder 'no-id'.
+        self._id = 'no-id' if _id is None else _id
+        self.sentences = []
+
+    def add_sentence(self, sentence):
+        """Append ``sentence`` to the paragraph."""
+        self.sentences.append(sentence)
+
+    def as_xml(self, id_prefix=None):
+        """Return a ``<p>`` element holding every sentence's XML.
+
+        The paragraph ``xml:id`` is ``<id_prefix>.<_id>`` when a truthy
+        ``id_prefix`` is given, otherwise ``_id``; that id is passed on as
+        the prefix for the sentences.
+        """
+        xml_id = self._id
+        if id_prefix:
+            xml_id = id_prefix + '.' + xml_id
+
+        paragraph_node = etree.Element('p')
+        set_xml_attr(paragraph_node, 'id', xml_id)
+        for sentence in self.sentences:
+            sentence_node = sentence.as_xml(id_prefix=xml_id)
+            paragraph_node.append(sentence_node)
+        return paragraph_node
+
+
+class TeiDocument:
+    """A TEI document: an id plus an ordered list of ``Paragraph`` objects."""
+
+    def __init__(self, _id, paragraphs=None):
+        """Create a document.
+
+        ``paragraphs`` is stored by reference when given.  When omitted, each
+        instance gets its own fresh list; a shared ``list()`` default would
+        let ``add_paragraph`` on one document leak into every other document
+        created without an explicit list.
+        """
+        self._id = _id
+        self.paragraphs = [] if paragraphs is None else paragraphs
+
+    def as_xml(self):
+        """Return the ``<TEI>`` root element for this document.
+
+        The tree has a ``teiHeader`` and a ``text/body`` holding the
+        paragraphs.  The header's ``tagsDecl`` lists how many ``p``, ``s``,
+        ``pc`` and ``w`` elements occur under ``text``.
+        """
+        root = etree.Element('TEI')
+        # The TEI namespace is written as a plain ``xmlns`` attribute; the
+        # elements themselves are created without a namespace, which is why
+        # the un-prefixed XPath counts below find them.
+        root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
+        set_xml_attr(root, 'lang', 'sl')
+
+        xml_id = self._id
+        if xml_id is not None:
+            set_xml_attr(root, 'id', xml_id)
+
+        tei_header = etree.SubElement(root, 'teiHeader')
+
+        text = etree.SubElement(root, 'text')
+        body = etree.SubElement(text, 'body')
+        for para in self.paragraphs:
+            body.append(para.as_xml(id_prefix=xml_id))
+
+        # The header is filled in after the body so the counts are final.
+        encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
+        tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
+        namespace = etree.SubElement(tags_decl, 'namespace')
+        namespace.set('name', 'http://www.tei-c.org/ns/1.0')
+        for tag in ['p', 's', 'pc', 'w']:
+            count = int(text.xpath('count(.//{})'.format(tag)))
+            tag_usage = etree.SubElement(namespace, 'tagUsage')
+            tag_usage.set('gi', tag)
+            tag_usage.set('occurs', str(count))
+        return root
+
+    def add_paragraph(self, paragraph):
+        """Append ``paragraph`` to the document."""
+        self.paragraphs.append(paragraph)
+
+
+def build_tei_etrees(documents):
+    """Return the result of ``as_xml()`` for every document, in order."""
+    return [document.as_xml() for document in documents]
+
+
+def set_xml_attr(node, attribute, value):
+    """Set ``attribute`` on ``node`` in the XML namespace (e.g. ``xml:id``)."""
+    xml_namespace = '{http://www.w3.org/XML/1998/namespace}'
+    qualified_name = xml_namespace + attribute
+    node.attrib[qualified_name] = value
+
+
+def parse_metaline(line):
+    """Split a ``# key = value`` comment line into ``(key, value)``.
+
+    The key is the text before the first ``=`` with every ``#`` removed and
+    surrounding whitespace stripped.  The value is the stripped text after
+    the first ``=``; it is ``None`` when there is no ``=`` or when nothing
+    but whitespace follows it (including nothing at all, which previously
+    produced an empty string instead of ``None``).
+    """
+    key_part, _, value_part = line.partition('=')
+    key = key_part.replace('#', '').strip()
+    val = value_part.strip() or None
+    return (key, val)
+
+
+def is_metaline(line):
+    """Return True when ``line`` starts with ``# <key> =`` (a metadata line)."""
+    return re.match('# .+ =.*', line) is not None
+
+
+def construct_tei_documents_from_list(object_list):
+    """Wrap ``object_list`` in a single id-less paragraph and document.
+
+    Returns an empty list for an empty ``object_list``; otherwise a
+    one-element list holding a ``TeiDocument`` (id ``None``) whose only
+    paragraph is ``construct_paragraph(None, object_list)``.
+    """
+    if not len(object_list) > 0:
+        return []
+
+    only_paragraph = construct_paragraph(None, object_list)
+    return [TeiDocument(None, [only_paragraph])]
+
+
+def construct_tei_documents(conllu_lines):
+    """Group CoNLL-U lines into ``TeiDocument`` objects.
+
+    ``# newdoc id = ...`` starts a new document and ``# newpar id = ...`` a
+    new paragraph.  ``# sent_id = ...`` lines and all non-metadata,
+    non-whitespace lines are buffered and handed to ``construct_paragraph``;
+    any other metadata line is dropped.
+
+    Returns the list of documents, empty when no paragraph content was seen.
+    """
+    documents = []
+
+    doc_id = None
+    document_paragraphs = []
+
+    para_id = None
+    para_buffer = []
+
+    for line in conllu_lines:
+        if not is_metaline(line):
+            if not line.isspace():
+                para_buffer.append(line)
+            continue
+
+        key, val = parse_metaline(line)
+        if key == 'newdoc id':
+            if para_buffer:
+                document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                # Clear the buffer: without this the flushed lines would be
+                # emitted a second time inside the next document.
+                para_buffer = []
+            if document_paragraphs:
+                documents.append(
+                    TeiDocument(doc_id, document_paragraphs))
+                document_paragraphs = []
+            doc_id = val
+            # A paragraph id belongs to the document it was declared in.
+            para_id = None
+        elif key == 'newpar id':
+            if para_buffer:
+                document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                para_buffer = []
+            para_id = val
+        elif key == 'sent_id':
+            para_buffer.append(line)
+
+    if para_buffer:
+        document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+
+    if document_paragraphs:
+        documents.append(
+            TeiDocument(doc_id, document_paragraphs))
+
+    return documents
+
+
+def construct_paragraph_from_list(para_id, etree_source_sentences):
+    """Return a ``Paragraph`` with id ``para_id`` holding the given sentences."""
+    paragraph = Paragraph(para_id)
+    for prepared_sentence in etree_source_sentences:
+        paragraph.add_sentence(prepared_sentence)
+    return paragraph
+
+
+def construct_paragraph(para_id, conllu_lines):
+    """Build a ``Paragraph`` from CoNLL-U lines, split on ``sent_id`` lines.
+
+    Non-metadata, non-whitespace lines are collected until the next
+    ``# sent_id = ...`` line (or the end of input) and then turned into a
+    sentence with ``construct_sentence``.  Other metadata lines are ignored.
+    """
+    paragraph = Paragraph(para_id)
+
+    current_sent_id = None
+    pending_lines = []
+
+    for line in conllu_lines:
+        if not is_metaline(line):
+            if not line.isspace():
+                pending_lines.append(line)
+            continue
+
+        key, val = parse_metaline(line)
+        if key != 'sent_id':
+            continue
+
+        # A new sentence starts: emit whatever was collected for the last one.
+        if len(pending_lines) > 0:
+            paragraph.add_sentence(construct_sentence(current_sent_id, pending_lines))
+            pending_lines = []
+        current_sent_id = val
+
+    if len(pending_lines) > 0:
+        paragraph.add_sentence(construct_sentence(current_sent_id, pending_lines))
+
+    return paragraph
+
+
+def construct_sentence_from_list(sent_id, object_list):
+    """Build a ``Sentence`` (without UD ``msd`` output) from token dicts.
+
+    Each dict must provide ``'token'``, ``'lemma'``, ``'ana'`` and
+    ``'space_after'``.  Both UD fields are filled with the placeholder
+    ``'_'``.
+    """
+    sentence = Sentence(sent_id, no_ud=True)
+    for tokens in object_list:
+        # The first four characters of 'ana' are dropped - presumably the
+        # 'mte:' prefix, which Sentence.as_xml adds back (verify for inputs
+        # that use a different prefix).
+        xpos = tokens['ana'][4:]
+        misc = '_' if tokens['space_after'] else 'SpaceAfter=No'
+
+        sentence.add_item(
+            tokens['token'],
+            tokens['lemma'],
+            '_',
+            '_',
+            xpos,
+            misc)
+
+    return sentence
+
+
+def construct_sentence(sent_id, lines):
+    """Build a ``Sentence`` from tab-separated CoNLL-U token lines.
+
+    Lines starting with ``#`` and whitespace-only lines are skipped.  The
+    columns are read by position: 1 token, 2 lemma, 3 upos, 4 xpos,
+    5 upos_other, 6 link target, 7 link name, 9 misc.
+
+    Raises:
+        IndexError: if a token line has fewer than ten columns.
+    """
+    sentence = Sentence(sent_id)
+    for line in lines:
+        if line.startswith('#') or line.isspace():
+            continue
+        # Strip '\r' as well as '\n': with CRLF input a trailing '\r' would
+        # stay glued to the last column and hide "SpaceAfter=No".
+        columns = line.rstrip('\r\n').split('\t')
+        token = columns[1]
+        lemma = columns[2]
+        upos = columns[3]
+        xpos = columns[4]
+        upos_other = columns[5]
+
+        sentence.add_item(
+            token,
+            lemma,
+            upos,
+            upos_other,
+            xpos,
+            columns[9])
+
+        sentence.add_link(
+            columns[6],
+            columns[7])
+    return sentence
+
+
+def construct_tei_etrees(conllu_lines):
+    """Parse CoNLL-U lines and return one XML root element per document."""
+    return build_tei_etrees(construct_tei_documents(conllu_lines))
+
+
+def convert_file(input_file_name, output_file_name):
+    """Convert the first document of a CoNLL-U file to a TEI XML file.
+
+    The input is read as UTF-8 and closed even if parsing fails; the output
+    is written once (it used to be serialised twice in a row).
+
+    Raises:
+        IndexError: if the input yields no document at all.
+    """
+    with open(input_file_name, 'r', encoding='utf-8') as input_file:
+        root = construct_tei_etrees(input_file)[0]
+
+    tree = etree.ElementTree(root)
+    tree.write(output_file_name, pretty_print=True, encoding='utf-8')
+
+
+# Tagset selector, overwritten from --system below.  It is not read anywhere
+# else in this file.
+system = 'jos'  # default (TODO: make this cleaner)
+
+if __name__ == '__main__':
+    from glob import glob
+
+    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
+    parser.add_argument('files', nargs='+', help='CoNNL-U file')
+    parser.add_argument('-o', '--out-file', dest='out', default=None,
+                        help='Write output to file instead of stdout.')
+    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
+
+    args = parser.parse_args()
+    system = args.system
+
+    f_out = open(args.out, 'w', encoding='utf-8') if args.out else sys.stdout
+    try:
+        # Every positional argument is expanded as a glob pattern.
+        for arg in args.files:
+            for f in glob(arg):
+                with open(f, 'r', encoding='utf-8') as conllu_f:
+                    tei_etrees = construct_tei_etrees(conllu_f)
+                for tei_etree in tei_etrees:
+                    f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
+    finally:
+        # Close the output file, but never close stdout.
+        if f_out is not sys.stdout:
+            f_out.close()
--- a/svala2tei.py
+++ b/svala2tei.py
@ -0,0 +1,264 @@
+import argparse
+import json
+import logging
+import os
+import shutil
+import time
+from xml.etree import ElementTree
+
+from lxml import etree
+
+from src.create_tei import construct_tei_etrees, construct_tei_documents_from_list, construct_sentence_from_list, \
+    construct_paragraph_from_list, TeiDocument, build_tei_etrees
+
+# Show INFO-level records; this script logs the input path and the run time.
+logging.basicConfig(level=logging.INFO)
+
+
+def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data, sentence_string_source_id, sentence_string_target_id):
+    """Add an unchanged token to both sides and link the two with an edge.
+
+    ``svala_i`` must be a string; the edge labels are read from
+    ``svala_data['edges']['e-s<svala_i>-t<svala_i>']``.  One token dict is
+    appended to ``source`` and one to ``target`` (same content, different
+    ``id``), and one edge dict is appended to ``edges``.
+    """
+    edge_id = "e-" + ("s" + svala_i) + "-" + ("t" + svala_i)
+    source_token_id = sentence_string_source_id + f'.{source_i}'
+    target_token_id = sentence_string_target_id + f'.{target_i}'
+
+    # Punctuation has no lemma attribute, so its text doubles as the lemma.
+    if el.tag.startswith('w'):
+        token_tag, lemma = 'w', el.attrib['lemma']
+    else:
+        token_tag, lemma = 'pc', el.text
+
+    for side, token_id in ((source, source_token_id), (target, target_token_id)):
+        side.append({
+            'token': el.text,
+            'tag': token_tag,
+            'ana': el.attrib['ana'],
+            'lemma': lemma,
+            'id': token_id,
+            'space_after': False,
+        })
+
+    labels = svala_data['edges'][edge_id]['labels']
+    edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
+
+
+def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids):
+    """Append a token dict for ``el`` to ``out_list`` and its id to ``out_list_ids``.
+
+    The token id is ``<sentence_string_id>.<out_list_i>``.
+    """
+    token_id = sentence_string_id + f'.{out_list_i}'
+
+    # Punctuation has no lemma attribute, so its text doubles as the lemma.
+    if el.tag.startswith('w'):
+        token_tag, lemma = 'w', el.attrib['lemma']
+    else:
+        token_tag, lemma = 'pc', el.text
+
+    entry = {
+        'token': el.text,
+        'tag': token_tag,
+        'ana': el.attrib['ana'],
+        'lemma': lemma,
+        'id': token_id,
+        'space_after': False,
+    }
+    out_list.append(entry)
+    out_list_ids.append(token_id)
+
+
+def add_errors(svala_i, source_i, target_i, error, source, target, edges, svala_data, sentence_string_source_id, sentence_string_target_id):
+    """Add the tokens of one error element and the edge that links them.
+
+    Children of ``error`` are handled by tag prefix:
+
+    * ``w`` / ``pc``: token on the source side;
+    * ``c``: marks the last source token as followed by a space;
+    * ``p``: its ``w`` / ``pc`` children are tokens on the target side;
+    * ``u2``: nested element whose tokens (down to ``u5``) go to the
+      source side.
+
+    Every token consumes one ``svala_i``.  A single edge is appended to
+    ``edges``; its labels are read from ``svala_data['edges']`` under the key
+    ``'e-'`` followed by the sorted source ids and then the sorted target
+    ids, joined with ``'-'``.
+
+    Returns the updated ``(svala_i, source_i, target_i)`` counters.
+    """
+    deepest_level = 5  # nested elements are followed from u2 down to u5
+
+    source_edge_ids = []
+    target_edge_ids = []
+    source_ids = []
+    target_ids = []
+
+    def is_token(node):
+        return node.tag.startswith('w') or node.tag.startswith('pc')
+
+    def take_source_token(node):
+        """Register ``node`` as the next source token."""
+        nonlocal svala_i, source_i
+        source_edge_ids.append("s" + str(svala_i))
+        add_error_token(node, source, sentence_string_source_id, source_i, source_ids)
+        source_i += 1
+        svala_i += 1
+
+    def take_target_tokens(parent):
+        """Register the direct token children of ``parent`` on the target side."""
+        nonlocal svala_i, target_i
+        for node in parent:
+            if is_token(node):
+                target_edge_ids.append("t" + str(svala_i))
+                add_error_token(node, target, sentence_string_target_id, target_i, target_ids)
+                target_i += 1
+                svala_i += 1
+            elif node.tag.startswith('c'):
+                target[-1]['space_after'] = True
+
+    def take_nested_source_tokens(parent, level):
+        """Walk a ``u<level>`` element, descending into ``u<level + 1>``."""
+        for node in parent:
+            if is_token(node):
+                take_source_token(node)
+            elif node.tag.startswith('c'):
+                source[-1]['space_after'] = True
+            elif level < deepest_level and node.tag.startswith('u' + str(level + 1)):
+                take_nested_source_tokens(node, level + 1)
+
+    # solar5.7
+    for el in error:
+        if is_token(el):
+            take_source_token(el)
+        elif el.tag.startswith('c'):
+            source[-1]['space_after'] = True
+        elif el.tag.startswith('p'):
+            take_target_tokens(el)
+        elif el.tag.startswith('u2'):
+            take_nested_source_tokens(el, 2)
+            # NOTE(review): the direct w/pc children of the u2 element are
+            # registered a second time here, as target tokens, while its
+            # nested <p> children are never visited.  Behaviour is kept as
+            # it was - confirm that this is intended.
+            take_target_tokens(el)
+
+    # The ids are sorted as strings (so 's10' comes before 's9');
+    # presumably this matches how the svala edge keys were built - verify.
+    edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
+    edge_id = "e-" + "-".join(edge_ids)
+    edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': svala_data['edges'][edge_id]['labels']})
+
+    return svala_i, source_i, target_i
+
+
+def process_file(et, args):
+    """Write per-paragraph source/target TEI files and an error-link JSON.
+
+    ``args.results_folder`` is deleted if it exists and then recreated.  For
+    a ``div`` whose ``bibl/@n`` (with ``/`` replaced by ``_``) names a folder
+    under ``args.svala_folder``, every ``p`` is combined with its svala JSON
+    file and written as three files named ``<paragraph xml:id>_source``,
+    ``_target`` and ``_errors``.
+
+    Raises:
+        KeyError: if a paragraph has no matching svala file, or an expected
+            edge is missing from the svala data.
+    """
+    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
+
+    if os.path.exists(args.results_folder):
+        shutil.rmtree(args.results_folder)
+    # makedirs also creates missing parent folders, which os.mkdir rejects.
+    os.makedirs(args.results_folder)
+
+    for div in et.iter('div'):
+        bibl = div.find('bibl')
+        file_name = bibl.get('n').replace('/', '_')
+
+        svala_path = os.path.join(args.svala_folder, file_name)
+        # skip files that are not svala annotated (to enable short examples)
+        if not os.path.isdir(svala_path):
+            continue
+
+        # Map a shortened file name to the full one: 13 trailing characters
+        # are dropped when the name contains 'problem', otherwise 5.
+        svala_dict = {
+            (fname[:-13] if 'problem' in fname else fname[:-5]): fname
+            for fname in os.listdir(svala_path)
+        }
+
+        for paragraph in div.findall('p'):
+            para_id = paragraph.attrib[xml_id_attr]
+            sentences = paragraph.findall('s')
+            # svala token numbering runs through the whole paragraph.
+            svala_i = 1
+
+            # read json
+            svala_file = os.path.join(svala_path, svala_dict[para_id])
+            with open(svala_file, encoding='utf-8') as jf:
+                svala_data = json.load(jf)
+
+            etree_source_sentences = []
+            etree_target_sentences = []
+            edges = []
+            for sentence_id, sentence in enumerate(sentences, start=1):
+                source = []
+                target = []
+
+                source_i = 1
+                target_i = 1
+                sentence_string_source_id = para_id + f's.{sentence_id}'
+                sentence_string_target_id = para_id + f't.{sentence_id}'
+                for el in sentence:
+                    if el.tag.startswith('w') or el.tag.startswith('pc'):
+                        add_token(str(svala_i), source_i, target_i, el, source, target, edges, svala_data, sentence_string_source_id, sentence_string_target_id)
+                        svala_i += 1
+                        source_i += 1
+                        target_i += 1
+                    elif el.tag.startswith('u'):
+                        svala_i, source_i, target_i = add_errors(svala_i, source_i, target_i, el, source, target, edges, svala_data, sentence_string_source_id, sentence_string_target_id)
+                    elif el.tag.startswith('c'):
+                        source[-1]['space_after'] = True
+                        target[-1]['space_after'] = True
+
+                etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source))
+                etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target))
+
+            # The paragraph id is split on '.': the first part becomes the
+            # document id, the second (plus 's' or 't') the paragraph id.
+            id_parts = para_id.split('.')
+            doc_part, para_part = id_parts[0], id_parts[1]
+
+            etree_source_paragraph = construct_paragraph_from_list(para_part + 's', etree_source_sentences)
+            etree_source_document = TeiDocument(doc_part, [etree_source_paragraph])
+            etree_source = build_tei_etrees([etree_source_document])
+
+            etree_target_paragraph = construct_paragraph_from_list(para_part + 't', etree_target_sentences)
+            etree_target_document = TeiDocument(doc_part, [etree_target_paragraph])
+            etree_target = build_tei_etrees([etree_target_document])
+
+            # Explicit UTF-8: the XML text and the JSON (ensure_ascii=False)
+            # may contain non-ASCII characters.
+            with open(os.path.join(args.results_folder, f"{para_id}_source"), 'w', encoding='utf-8') as sf:
+                sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
+
+            with open(os.path.join(args.results_folder, f"{para_id}_target"), 'w', encoding='utf-8') as tf:
+                tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
+
+            with open(os.path.join(args.results_folder, f"{para_id}_errors"), 'w', encoding='utf-8') as jf:
+                json.dump(edges, jf, ensure_ascii=False, indent="  ")
+
+        # NOTE(review): only the first svala-annotated div is processed; the
+        # loop stops here.  Kept as it was - confirm that this is intended.
+        break
+
+
+def main(args):
+    """Read ``args.solar_file``, parse it as XML and run ``process_file``."""
+    with open(args.solar_file, 'r') as fp:
+        logging.info(args.solar_file)
+        raw_xml = fp.read()
+    process_file(ElementTree.XML(raw_xml), args)
+
+
+if __name__ == '__main__':
+    # The description and help texts used to be copied from an unrelated
+    # tool; they now describe what this script does with each option.
+    parser = argparse.ArgumentParser(
+        description='Combine an XML corpus file with svala JSON annotations and write, '
+                    'per paragraph, a source TEI file, a target TEI file and an error-link JSON file.')
+    parser.add_argument('--solar_file', default='data/Solar2.0/solar2.xml',
+                        help='input XML file whose div/p/s elements are converted')
+    parser.add_argument('--txt_file', default='data/txt/input',
+                        help='not used by this script at present')
+    parser.add_argument('--svala_folder', default='data/solar.svala.error.small',
+                        help='folder with one sub-folder of svala JSON files per document')
+    parser.add_argument('--results_folder', default='data/results/solar3.0',
+                        help='output folder; it is deleted and recreated on every run')
+    args = parser.parse_args()
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))