# luscenje_struktur/src/loader.py

from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib

from progress_bar import progress
from word import Word


def is_root_id(id_):
    """Return True when ``id_`` has exactly three dot-separated components."""
    # Three components means exactly two separating dots.
    return id_.count('.') == 2


def load_files(args, database):
    """Yield the loaded words of every input file not yet recorded as loaded.

    This is a generator: nothing runs until it is first advanced.

    Args:
        args: Object providing ``input`` (the file names), ``skip_id_check``,
            ``no_msd_translate`` and ``pc_tag``.
        database: Object providing ``init``, ``execute`` and ``commit``; the
            ``Files`` table in it keeps the names of already loaded files.

    Yields:
        For each file that is not yet listed in ``Files``, the list returned
        by the matching loader: ``file_sentence_generator`` for ``.xml``,
        ``load_csv`` (compressed) for ``.gz`` and ``load_csv`` (plain) for
        every other extension.

    Note:
        A file is written to ``Files`` only after the consumer resumes the
        generator, i.e. after it has finished with the yielded words.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for position, path in enumerate(filenames):
        print("FILE ", path, "{}/{}".format(position, len(filenames)))
        suffix = pathlib.Path(path).suffix

        # Skip any file whose name is already stored in the Files table.
        previous = database.execute("SELECT * FROM Files WHERE filename=?", (path, )).fetchone()
        if previous is not None:
            print("ALREADY LOADED")
            continue

        if suffix == ".xml":
            loaded_words = file_sentence_generator(
                load_xml(path), skip_id_check, do_msd_translate, args.pc_tag)
        else:
            # Anything that is not XML is treated as CSV; only ".gz" is
            # read as a compressed file.
            loaded_words = load_csv(path, suffix == ".gz")
        yield loaded_words

        database.execute("INSERT INTO Files (filename) VALUES (?)", (path, ))
        database.commit()


def lines_gz(filename):
    """Yield the lines of a gzip-compressed file, decoded as UTF-8.

    The raw line iterator is wrapped in ``progress`` with the label
    ``'load-gz'``; line endings are kept as read.
    """
    with gzip.open(filename, 'r') as compressed:
        yield from (raw.decode('utf8') for raw in progress(compressed, 'load-gz'))


def lines_csv(filename):
    """Yield the lines of an uncompressed text file, read as UTF-8.

    The encoding is given explicitly so the result does not depend on the
    platform's default encoding and agrees with the UTF-8 decoding that
    ``lines_gz`` applies to compressed input.  The line iterator is wrapped
    in ``progress`` with the label ``'load-csv'``; line endings are kept.
    """
    with open(filename, 'r', encoding='utf8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line


def load_csv(filename, compressed):
    """Load words and their links from a delimited text file.

    Every non-empty line is expected to hold seven fields, in this order:
    sentence id, word id, text, msd, lemma, link source and link type.
    Commas are treated as field separators as well as tabs.  A new sentence
    starts on a line whose word id is ``"1"``.

    A line that does not have exactly seven fields marks its whole sentence
    as bad; such a sentence contributes no words to the result.  Empty lines
    are ignored.

    Args:
        filename: Path of the file to read.
        compressed: When true the file is read through ``lines_gz``,
            otherwise through ``lines_csv``.

    Returns:
        A list with the ``Word`` objects of all good sentences, with the
        links between them already added.
    """
    result = []
    bad_sentence = False
    sentence_id = None

    words = {}
    links = []

    def sentence_end():
        # Resolve the buffered links of the current sentence and move its
        # words into the result; a bad sentence is dropped as a whole.
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: %s", sentence_id)
                continue
            words[lfrom].add_link(ana, words[ldest])
        result.extend(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        if not line_str:
            # Nothing to parse; indexing the split of an empty line would fail.
            continue

        # Commas separate fields too.  Three tabs in a row are turned back
        # into a "," field -- presumably a literal comma token whose own
        # separators were just converted (NOTE(review): confirm with data).
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        # Word id "1" opens a new sentence.  Also flush when only the bad
        # flag is set, so it cannot leak into the sentence that starts here.
        starts_sentence = len(line_split) > 1 and line_split[1] == "1"
        if starts_sentence and (words or bad_sentence):
            sentence_end()
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Wrong number of fields: drop the sentence and do not build a
            # word from values that are unset or left over from another line.
            bad_sentence = True
            continue

        sentence_id = sid
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        if link_src != '0':
            links.append((link_src, wid, link_type))

    sentence_end()
    return result

def load_xml(filename):
    """Read an XML file and parse it with namespace markers removed.

    The file is read as UTF-8 text, so the result does not depend on the
    platform's default encoding.

    Args:
        filename: Path of the XML file.

    Returns:
        The root element returned by ``ElementTree.XML``.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Remove the first default-namespace declaration so that elements can be
    # looked up by their bare tag names.
    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    # Strip the "xml:" prefix from attributes (e.g. xml:id becomes id).
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Create the words of a parsed XML tree and connect them by its links.

    Args:
        et: Root element; each ``s`` element below it is handled as one
            sentence.
        skip_id_check: When false, a link whose source id satisfies
            ``is_root_id`` is logged as an error and the program exits.
        do_msd_translate: Passed on to ``Word.from_xml`` and ``Word.pc_word``.
        pc_tag: Tag name of the elements converted with ``Word.pc_word``.

    Returns:
        A list of every word collected over all sentences.  Despite the
        function's name this is a plain list, not a generator.
    """
    words = {}
    all_sentences = list(et.iter('s'))
    for sent in progress(all_sentences, "load-text"):
        for word_el in sent.iter("w"):
            words[word_el.get('id')] = Word.from_xml(word_el, do_msd_translate)
        for pc_el in sent.iter(pc_tag):
            words[pc_el.get('id')] = Word.pc_word(pc_el, do_msd_translate)

        for link in sent.iter("link"):
            if 'dep' in link.keys():
                # Link given through separate afun / from / dep attributes.
                label, source, target = link.get('afun'), link.get('from'), link.get('dep')
            else:
                # Link given through "ana" plus a two-id "target" attribute;
                # only links whose ana starts with "syn:" are used.
                label = link.get('ana')
                if label[:4] != 'syn:':
                    continue
                label = label[4:]
                source, target = link.get('target').replace('#', '').split()

            if source not in words:
                # Links from unknown sources are silently skipped.
                continue

            if not skip_id_check and is_root_id(source):
                logging.error("NOO: {}".format(source))
                sys.exit(1)

            if target not in words:
                logging.error("Unknown id: {}".format(target))
                sys.exit(1)

            words[source].add_link(label, words[target])

    return list(words.values())
Loader to its own module 2019-06-17 13:38:55 +00:00			`from xml.etree import ElementTree`
			`import logging`
			`import re`
			`import sys`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`import gzip`
			`import pathlib`
Loader to its own module 2019-06-17 13:38:55 +00:00
New progress bar 2019-06-17 15:30:51 +00:00			`from progress_bar import progress`
Loader to its own module 2019-06-17 13:38:55 +00:00			`from word import Word`


			`def is_root_id(id_):`
			`return len(id_.split('.')) == 3`


files loaded now in database 2019-08-21 09:12:38 +00:00			`def load_files(args, database):`
Loader to its own module 2019-06-17 13:38:55 +00:00			`filenames = args.input`
			`skip_id_check = args.skip_id_check`
			`do_msd_translate = not args.no_msd_translate`

files loaded now in database 2019-08-21 09:12:38 +00:00			`database.init("CREATE TABLE Files ( filename varchar(2048) )")`

simplifying progress, because I will remove the parallel stuff 2019-07-03 08:23:18 +00:00			`for idx, fname in enumerate(filenames):`
			`print("FILE ", fname, "{}/{}".format(idx, len(filenames)))`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`extension = pathlib.Path(fname).suffix`
Loader to its own module 2019-06-17 13:38:55 +00:00
files loaded now in database 2019-08-21 09:12:38 +00:00			`# check if file with the same name already loaded...`
			`loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()`
			`if loaded is not None:`
			`print("ALREADY LOADED")`
			`continue`

Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`if extension == ".xml":`
			`et = load_xml(fname)`
			`yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)`
			`elif extension == ".gz":`
can now load csv files 2019-08-21 09:09:47 +00:00			`yield load_csv(fname, True)`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`else:`
can now load csv files 2019-08-21 09:09:47 +00:00			`yield load_csv(fname, False)`
			`# else:`
			`# raise NotImplementedError("Unknown file extension: {}".format(extension))`

files loaded now in database 2019-08-21 09:12:38 +00:00			`database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))`
			`database.commit()`


can now load csv files 2019-08-21 09:09:47 +00:00			`def lines_gz(filename):`
			`with gzip.open(filename, 'r') as fp:`
			`for line in progress(fp, 'load-gz'):`
			`yield line.decode('utf8')`


			`def lines_csv(filename):`
			`with open(filename, 'r') as fp:`
			`for line in progress(fp, 'load-csv'):`
			`yield line`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00

can now load csv files 2019-08-21 09:09:47 +00:00			`def load_csv(filename, compressed):`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`result = []`
			`bad_sentence = False`

Fixing loading bad gz files and progress showing 2019-06-26 11:06:43 +00:00			`words = {}`
			`links = []`

			`def sentence_end(bad_sentence):`
			`if bad_sentence:`
			`return`

			`for lfrom, ldest, ana in links:`
			`if lfrom not in words or ldest not in words:`
			`logging.warning("Bad link in sentence: " + line_split[0])`
			`continue`
			`words[lfrom].add_link(ana, words[ldest])`
			`result.extend(words.values())`

can now load csv files 2019-08-21 09:09:47 +00:00			`line_gen = lines_gz if compressed else lines_csv`
			`for line in line_gen(filename):`
			`line_str = line.strip()`
			`line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')`
			`line_split = line_fixed.split("\t")`

			`if line_split[1] == "1" and len(words) > 0:`
			`sentence_end(bad_sentence)`
			`bad_sentence = False`
			`links = []`
			`words = {}`

			`try:`
			`sid, wid, text, msd, lemma, link_src, link_type = line_split`
			`except ValueError:`
			`bad_sentence = True`
			`full_id = "{}.{}".format(sid, wid)`

			`words[wid] = Word(lemma, msd, full_id, text, True)`
			`if link_src != '0':`
			`links.append((link_src, wid, link_type))`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00
Fixing loading bad gz files and progress showing 2019-06-26 11:06:43 +00:00			`sentence_end(bad_sentence)`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`return result`
Loader to its own module 2019-06-17 13:38:55 +00:00
			`def load_xml(filename):`
			`with open(filename, 'r') as fp:`
			`content = fp.read()`

			`xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)`
			`xmlstring = xmlstring.replace(' xml:', ' ')`
			`return ElementTree.XML(xmlstring)`


			`def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):`
			`words = {}`
			`sentences = list(et.iter('s'))`
simplifying progress, because I will remove the parallel stuff 2019-07-03 08:23:18 +00:00			`for sentence in progress(sentences, "load-text"):`
Loader to its own module 2019-06-17 13:38:55 +00:00			`for w in sentence.iter("w"):`
Adding ability to load gz files. 2019-06-17 18:41:11 +00:00			`words[w.get('id')] = Word.from_xml(w, do_msd_translate)`
Loader to its own module 2019-06-17 13:38:55 +00:00			`for pc in sentence.iter(pc_tag):`
			`words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)`

			`for l in sentence.iter("link"):`
			`if 'dep' in l.keys():`
			`ana = l.get('afun')`
			`lfrom = l.get('from')`
			`dest = l.get('dep')`
			`else:`
			`ana = l.get('ana')`
			`if ana[:4] != 'syn:': # dont bother...`
			`continue`
			`ana = ana[4:]`
			`lfrom, dest = l.get('target').replace('#', '').split()`

			`if lfrom in words:`
			`if not skip_id_check and is_root_id(lfrom):`
			`logging.error("NOO: {}".format(lfrom))`
			`sys.exit(1)`

			`if dest in words:`
			`next_word = words[dest]`
			`words[lfrom].add_link(ana, next_word)`
			`else:`
			`logging.error("Unknown id: {}".format(dest))`
			`sys.exit(1)`

			`else:`
			`# strange errors, just skip...`
			`pass`

Fixing loading bad gz files and progress showing 2019-06-26 11:06:43 +00:00			`return list(words.values())`