diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py index acd3520..3a9c52d 100644 --- a/conversion_utils/conllu_to_tei.py +++ b/conversion_utils/conllu_to_tei.py @@ -4,6 +4,14 @@ import sys from lxml import etree +from conversion_utils.jos_msds_and_properties import Converter, Msd + +converter = Converter() +def translate_msd(msd_text, lang, lemma=None): + """ Translates msd using conversion_utils library. """ + return converter.properties_to_msd(converter.msd_to_properties(Msd(msd_text, 'en'), 'sl', lemma), + 'sl').code + class Sentence: def __init__(self, _id, no_ud=False, system='jos'): self._id = _id @@ -58,6 +66,7 @@ class Sentence: to_add = etree.Element('w') to_add.set('lemma', lemma) + xpos = translate_msd(xpos,'sl',lemma) to_add.set('ana', 'mte:' + xpos) if not self.no_ud: if upos_other != '_': @@ -137,35 +146,28 @@ class Paragraph: class TeiDocument: - def __init__(self, _id, paragraphs=list()): + def __init__(self, _id, paragraphs=list(), metadata=None): self._id = _id + self.metadata = metadata self.paragraphs = paragraphs def as_xml(self): - root = etree.Element('TEI') - root.set('xmlns', 'http://www.tei-c.org/ns/1.0') - set_xml_attr(root, 'lang', 'sl') + root = etree.Element('div') xml_id = self._id if xml_id is not None: set_xml_attr(root, 'id', xml_id) - - tei_header = etree.SubElement(root, 'teiHeader') - text = etree.SubElement(root, 'text') - body = etree.SubElement(text, 'body') + bibl = etree.Element('bibl') + bibl.set('corresp', f'#{xml_id}') + bibl.set('n', f'#{xml_id}') + for k, v in self.metadata.items(): + bibl_el = etree.Element(k) + bibl_el.text = v + bibl.append(bibl_el) + root.append(bibl) for para in self.paragraphs: - body.append(para.as_xml(id_prefix=xml_id)) - - encoding_desc = etree.SubElement(tei_header, 'encodingDesc') - tags_decl = etree.SubElement(encoding_desc, 'tagsDecl') - namespace = etree.SubElement(tags_decl, 'namespace') - namespace.set('name', 'http://www.tei-c.org/ns/1.0') - for tag in ['p', 's', 'pc', 'w']: - count = int(text.xpath('count(.//{})'.format(tag))) - tag_usage = etree.SubElement(namespace, 'tagUsage') - tag_usage.set('gi', tag) - tag_usage.set('occurs', str(count)) + root.append(para.as_xml(id_prefix=xml_id)) return root def add_paragraph(self, paragraph): @@ -173,10 +175,13 @@ class TeiDocument: def build_tei_etrees(documents): - elements = [] + root = etree.Element('body') + root.set('xmlns', 'http://www.tei-c.org/ns/1.0') + set_xml_attr(root, 'base', 'korpus.xml') + set_xml_attr(root, 'lang', 'sl') for document in documents: - elements.append(document.as_xml()) - return elements + root.append(document.as_xml()) + return root def set_xml_attr(node, attribute, value): @@ -199,11 +204,12 @@ def is_metaline(line): return False -def construct_tei_documents(conllu_lines): +def construct_tei_documents(conllu_lines, metadata): documents = [] doc_id = None - document_paragraphs = [] + doc_id_num = 0 + document_paragraphs = [] para_id = None para_buffer = [] @@ -215,9 +221,12 @@ def construct_tei_documents(conllu_lines): if len(para_buffer) > 0: document_paragraphs.append(construct_paragraph(para_id, para_buffer)) if len(document_paragraphs) > 0: + print(metadata) + print(doc_id_num) documents.append( - TeiDocument(doc_id, document_paragraphs)) + TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num])) document_paragraphs = [] + doc_id_num += 1 doc_id = val elif key == 'newpar id': if len(para_buffer) > 0: @@ -235,7 +244,8 @@ def construct_tei_documents(conllu_lines): if len(document_paragraphs) > 0: documents.append( - TeiDocument(doc_id, document_paragraphs)) + TeiDocument(doc_id, document_paragraphs, metadata[doc_id_num])) + doc_id_num += 1 return documents @@ -299,14 +309,14 @@ def construct_sentence(sent_id, lines): return sentence -def construct_tei_etrees(conllu_lines): - documents = construct_tei_documents(conllu_lines) +def construct_tei_etrees(conllu_lines, metadata): + documents = construct_tei_documents(conllu_lines, metadata) return build_tei_etrees(documents) -def convert_file(input_file_name, output_file_name): +def convert_file(input_file_name, output_file_name, metadata): input_file = open(input_file_name, 'r') - root = construct_tei_etrees(input_file)[0] + root = construct_tei_etrees(input_file, metadata) tree = etree.ElementTree(root) tree.write(output_file_name, encoding='UTF-8', pretty_print=True) input_file.close() diff --git a/conversion_utils/jos_msds_and_properties.py b/conversion_utils/jos_msds_and_properties.py index d6fbcd5..4f2fa81 100644 --- a/conversion_utils/jos_msds_and_properties.py +++ b/conversion_utils/jos_msds_and_properties.py @@ -261,8 +261,8 @@ class Converter: level information. """ - if (msd.code not in self.specifications.codes_map[msd.language]): - raise ConverterException('The msd {} is unknown'.format(msd.code)) + # if (msd.code not in self.specifications.codes_map[msd.language]): + # raise ConverterException('The msd {} is unknown'.format(msd.code)) category_char = msd.code[0].lower() value_chars = msd.code[1:] diff --git a/run.py b/run.py new file mode 100644 index 0000000..403206c --- /dev/null +++ b/run.py @@ -0,0 +1,59 @@ +import os + +from conversion_utils.conllu_to_tei import convert_file +import csv +# dir_path = 'data/conllu' +# out_dir_path = 'data/tei' +# for filename in os.listdir(dir_path): +# in_name = os.path.join(dir_path, filename) +# out_filename = filename.split('.')[:-1] +# out_filename = '.'.join(out_filename) + '.xml' +# out_name = os.path.join(out_dir_path, out_filename) +# convert_file(in_name, out_name) + +metadata_list = [] +with open('data/metadata.csv', newline='') as csvfile: + for line in csv.reader(csvfile): + metadata_list.append(line) + +metadata = [{} for i in range(len(metadata_list[0]) - 1)] +for i in range(1, len(metadata_list[0])): + metadata[i - 1]['title'] = metadata_list[0][i] + metadata[i - 1]['subtitle'] = metadata_list[1][i] + metadata[i - 1]['authors'] = metadata_list[2][i] + metadata[i - 1]['first_edition'] = metadata_list[3][i] + metadata[i - 1]['edition_in_corpus'] = metadata_list[4][i] + metadata[i - 1]['layer_according_to_SEJO'] = metadata_list[5][i] + metadata[i - 1]['audience'] = metadata_list[6][i] + metadata[i - 1]['hours_of_classes'] = metadata_list[7][i] + metadata[i - 1]['publisher'] = metadata_list[8][i] + metadata[i - 1]['file_name'] = metadata_list[9][i] + + +dir_path = 'data/conllu' +out_path = 'data/tei/tei.xml' +out_dir = 'data/conllu.conllu' +# out_dir = dir_path + '/conllu_small.conllu' +metadata_indices = [6, 16, 13, 7, 2, 1, 3, 14, 15, 0, 8, 4, 11, 9, 12, 5, 10] +out_file = open(out_dir, 'w') +metadata_indices = [] +for fn_i, filename in enumerate(os.listdir(dir_path)): + in_name = os.path.join(dir_path, filename) + out_filename = filename.split('.')[:-1] + out_filename = '.'.join(out_filename) + for m_i, el in enumerate(metadata): + if el['file_name'] == out_filename: + metadata_indices.append(m_i) + out_filename = out_filename + '.xml' + out_name = os.path.join(out_dir, out_filename) + in_file = open(in_name, 'r') + data = f'# newdoc id = doc{str(fn_i+1)}\n' + data += in_file.read() + in_file.close() + out_file.write(data) +out_file.close() + +shuffled_metadata = [metadata[el] for el in metadata_indices] + + +convert_file(out_dir, out_path, shuffled_metadata)