luscenje_struktur/luscenje_struktur/loader.py


import os
from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from io import StringIO
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word
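
# Corpus loading utilities: read sentences from TEI XML, CSV (optionally
# gzipped) and CoNLL-U files, build Word objects together with their
# dependency links, and record already-processed files in a small database
# table so repeated runs can skip them.
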
def is_root_id(id_):
    return len(id_.split('.')) == 3

def load_files(args, database, w_collection=None, input_corpus=None):
    """Yield parsed sentences from XML, CSV/GZ or CoNLL-U corpus files.

    Without input_corpus, one list of Word objects is yielded per file; with
    input_corpus, (sentence_id, words, attributes) tuples restricted to the
    sentences present in w_collection are yielded. Processed files are
    recorded in the Files table so that repeated runs skip them.
    """
    filenames = input_corpus if input_corpus is not None else args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    if len(filenames) == 1 and os.path.isdir(filenames[0]):
        filenames = [os.path.join(filenames[0], file) for file in os.listdir(filenames[0]) if file[-5:] != '.zstd']
    if len(filenames) > 1:
        filenames = [filename for filename in filenames if filename[-5:] != '.zstd']
        filenames = sorted(filenames, key=lambda x: int(x.split('.')[-1]))

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for idx, fname in enumerate(filenames):
        logging.info("FILE " + fname + " {}/{}".format(idx + 1, len(filenames)))
        extension = pathlib.Path(fname).suffix

        # skip files that were already loaded in a previous run
        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
        if loaded is not None:
            logging.info("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            if input_corpus is None:
                yield file_sentence_generator(et, args)
            else:
                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                for sent_id, sentence, othr_attributes in sentence_generator:
                    yield sent_id, sentence, othr_attributes
        elif extension == ".gz":
            if input_corpus is None:
                yield load_csv(fname, True)
            else:
                sentences = load_csv_valency(fname, True, w_collection)
                for sentence in sentences:
                    yield sentence
        elif extension == ".conllu":
            if input_corpus is None:
                yield load_conllu(fname)
            else:
                raise Exception('conllu with input_corpus is not supported!')
        else:
            if input_corpus is None:
                yield load_csv(fname, False)
            else:
                sentences = load_csv_valency(fname, False, w_collection)
                for sentence in sentences:
                    yield sentence

        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
        database.commit()

def lines_gz(filename):
    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            yield line.decode('utf8')


def lines_csv(filename):
    with open(filename, 'r') as fp:
        for line in progress(fp, 'load-csv'):
            yield line

def load_conllu(filename):
    """Parse a CoNLL-U file into a flat list of Word objects with dependency links."""
    import conllu

    result = []
    bad_sentence = False

    words = {}
    links = []

    def sentence_end(bad_sentence, sent_id):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + sent_id)
                continue
            words[lfrom].add_link(ana, words[ldest])

        result.extend(words.values())

    with open(filename, 'r') as f:
        data = f.read()

    # conlls = conllu.parse_incr(StringIO(data))
    # for sent in conlls:
    #     try:
    #         for word in sent:
    #             full_id = "{}.{}".format(sent.metadata['sent_id'], str(word['id']))
    #             words[str(word['id'])] = Word(word['id'], word['xpos'], full_id, word['form'], False)
    #     except:
    #         logging.error(f"Error while reading file {filename} in sentence {sent.metadata['sent_id']}. Check if required data is available!")

    conlls = conllu.parse_incr(StringIO(data))
    # build dep parse
    for sent in conlls:
        try:
            # adding fake word
            words['0'] = Word('', '', '0', '', False, True)
            for word in sent:
                if type(word['id']) == tuple:
                    continue

                full_id = "{}.{}".format(sent.metadata['sent_id'], str(word['id']))
                words[str(word['id'])] = Word(word['lemma'], word['upos'], full_id, word['form'], False)
                links.append((str(word['head']), str(word['id']), word['deprel']))
            sentence_end(False, sent.metadata['sent_id'])
            links = []
            words = {}
        except:
            links = []
            words = {}
            logging.error(f"Error while reading file {filename} in sentence {sent.metadata['sent_id']}. Check if required data is available!")

    return result

def load_csv(filename, compressed):
    """Parse a CSV corpus export (plain or gzipped) into Word objects.

    Each record carries: sentence id, word id, form, MSD tag, lemma,
    head word id and dependency relation.
    """
    result = []
    bad_sentence = False

    words = {}
    links = []

    def sentence_end(bad_sentence):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])

        result.extend(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        if line_split[1] == "1" and len(words) > 0:
            # adding fake word
            words['0'] = Word('', '', '0', '', False, True)
            sentence_end(bad_sentence)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            bad_sentence = True
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # if link_src != '0':
        links.append((link_src, wid, link_type))

    # adding fake word
    words['0'] = Word('', '', '0', '', False, True)
    sentence_end(bad_sentence)
    return result

def load_csv_valency(filename, compressed, w_collection):
    """Like load_csv, but return only sentences present in w_collection as
    (sentence_id, words, attributes) tuples, sorted by sentence id."""
    # TODO skip sentences that are not in sentences of interest!!!
    result = {}
    bad_sentence = False

    words = {}
    links = []
    idi = 0

    def sentence_end(bad_sentence, sid):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])

        result[sid] = list(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        if line_split[1] == "1" and len(words) > 0:
            sentence_end(bad_sentence, sid)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            bad_sentence = True
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # only word-like tokens (not punctuation) get a running index
        if not (len(text[0]) == 1 and re.match(r'^[\w]+$', text[0]) is None):
            words[wid].idi = str(idi)
            idi += 1

        if link_src != '0':
            links.append((link_src, wid, link_type))

    sentence_end(bad_sentence, sid)

    sentence_ids = list(result.keys())
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    cur = [c for c in cur]
    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'}) for c in cur]
    return sorted(unsorted_result, key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))

def load_xml(filename):
    """Read a TEI XML file, strip the default namespace and 'xml:' attribute
    prefixes, and return the parsed root element."""
    with open(filename, 'r') as fp:
        content = fp.read()
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)

def file_sentence_generator(et, args):
    """Collect every sentence of a parsed TEI document into Word objects,
    including punctuation glue and dependency links, and return them as a
    single list."""
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    pc_tag = args.pc_tag
    use_punctuations = not args.ignore_punctuations
    previous_pc = False

    words = {}
    paragraphs = list(et.iter('p'))
    for paragraph in progress(paragraphs, "load-text"):
        previous_glue = ''
        sentences = list(paragraph.iter('s'))
        for sentence in sentences:
            # create fake root word
            words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
            last_word_id = None

            if args.new_tei:
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            words[w.get('id')].previous_glue = previous_glue
                            words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
            else:
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = ''
                            last_word_id = None
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            last_word_id = w.get('id')
                            words[w.get('id')].previous_glue = previous_glue
                            previous_glue = ''
                    elif use_punctuations and w.tag == 'c':
                        # always save previous glue
                        previous_glue = w.text
                        if last_word_id:
                            words[last_word_id].glue += w.text

            for l in sentence.iter("link"):
                if 'dep' in l.keys():
                    ana = l.get('afun')
                    lfrom = l.get('from')
                    dest = l.get('dep')
                else:
                    ana = l.get('ana')
                    if ana[:8] != 'jos-syn:':  # don't bother...
                        continue
                    ana = ana[8:]
                    lfrom, dest = l.get('target').replace('#', '').split()

                if lfrom in words:
                    if not skip_id_check and is_root_id(lfrom):
                        logging.error("Id {} is not fine, you might want to try with flag --skip-id-check".format(lfrom))
                        sys.exit(1)

                    if dest in words:
                        next_word = words[dest]
                        words[lfrom].add_link(ana, next_word)
                    else:
                        logging.error("Unknown id: {}".format(dest))
                        sys.exit(1)
                else:
                    # strange errors, just skip...
                    pass

    return list(words.values())

def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    """Yield (sentence_id, words, attributes) for the sentences of a parsed
    TEI document that are present in w_collection."""
    words = {}
    sentences = list(et.iter('s'))
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        if sentence.attrib['id'] not in sentences_of_interest:
            continue

        idi = 0
        last_word_id = None
        for w in sentence.iter():
            if w.tag == 'w':
                last_word_id = w.get('id')
                words[last_word_id] = Word.from_xml(w, do_msd_translate)
                words[last_word_id].idi = str(idi)
                idi += 1
            elif w.tag == pc_tag:
                last_word_id = w.get('id')
                words[last_word_id] = Word.pc_word(w, do_msd_translate)
            elif w.tag == 'c':
                if last_word_id:
                    words[last_word_id].glue += w.text

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                ana = l.get('ana')
                if ana[:8] != 'jos-syn:':  # don't bother...
                    continue
                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("Id {} is not fine, you might want to try with flag --skip-id-check".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)
            else:
                # strange errors, just skip...
                pass

        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
        words = {}

def file_sentence_glue_generator(files, pc_tag, w_collection):
    """Yield (sentence_id, [[form, index, glue], ...]) for the sentences of the
    given XML files that are present in w_collection, where glue is the text
    ('c' elements) following each token."""
    for fname in files:
        et = load_xml(fname)
        words = {}
        sentences = list(et.iter('s'))
        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            if sentence.attrib['id'] not in sentences_of_interest:
                continue

            w_id = 1
            last_word_id = None
            sentence_id = None
            for w in sentence.iter():
                if w.tag == 'w':
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == pc_tag:
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == 'c':
                    if last_word_id:
                        words[last_word_id][2] += w.text
                elif w.tag == 's':
                    sentence_id = w.attrib['id']

            yield (sentence_id, list(words.values()))
            words = {}
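

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal, hypothetical
# example of driving load_files() over corpus files given on the command line.
# It assumes an argparse namespace exposing the attributes this module reads
# (input, skip_id_check, no_msd_translate, pc_tag, new_tei,
# ignore_punctuations; the defaults below are illustrative) and a small
# in-memory stand-in for the `database` argument, which only needs the
# init()/execute()/commit() calls used above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse
    import sqlite3

    class _SqliteDatabase:
        """Thin sqlite wrapper exposing the calls load_files() makes."""

        def __init__(self):
            self._conn = sqlite3.connect(':memory:')

        def init(self, create_sql):
            self._conn.execute(create_sql)

        def execute(self, sql, params=()):
            return self._conn.execute(sql, params)

        def commit(self):
            self._conn.commit()

    parser = argparse.ArgumentParser()
    parser.add_argument('input', nargs='+')
    parser.add_argument('--skip-id-check', action='store_true')
    parser.add_argument('--no-msd-translate', action='store_true')
    parser.add_argument('--pc-tag', default='pc')
    parser.add_argument('--new-tei', action='store_true')
    parser.add_argument('--ignore-punctuations', action='store_true')
    cli_args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    # Without input_corpus, load_files yields one list of Word objects per file.
    for file_words in load_files(cli_args, _SqliteDatabase()):
        logging.info("loaded %d words", len(file_words))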