import os
from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from io import StringIO

from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word


def is_root_id(id_):
    # ids with exactly three dot-separated parts are sentence (root) ids
    return len(id_.split('.')) == 3


def load_files(args, database, w_collection=None, input_corpus=None):
    filenames = input_corpus if input_corpus is not None else args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    if len(filenames) == 1 and os.path.isdir(filenames[0]):
        filenames = [os.path.join(filenames[0], file) for file in os.listdir(filenames[0]) if file[-5:] != '.zstd']
    if len(filenames) > 1:
        filenames = [filename for filename in filenames if filename[-5:] != '.zstd']
        # corpus parts are expected to end in a numeric suffix, e.g. corpus.xml.0
        filenames = sorted(filenames, key=lambda x: int(x.split('.')[-1]))

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for idx, fname in enumerate(filenames):
        logging.info("FILE {} {}/{}".format(fname, idx + 1, len(filenames)))
        extension = pathlib.Path(fname).suffix

        # check if a file with the same name has already been loaded...
        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
        if loaded is not None:
            logging.info("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            if input_corpus is None:
                yield file_sentence_generator(et, args)
            else:
                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate,
                                                                     args.pc_tag, w_collection)
                for sent_id, sentence, othr_attributes in sentence_generator:
                    yield sent_id, sentence, othr_attributes
        elif extension == ".gz":
            if input_corpus is None:
                yield load_csv(fname, True)
            else:
                sentences = load_csv_valency(fname, True, w_collection)
                for sentence in sentences:
                    yield sentence
        elif extension == ".conllu":
            if input_corpus is None:
                yield load_conllu(fname)
            else:
                raise Exception('conllu with input_corpus is not supported!')
        else:
            if input_corpus is None:
                yield load_csv(fname, False)
            else:
                sentences = load_csv_valency(fname, False, w_collection)
                for sentence in sentences:
                    yield sentence

        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
        database.commit()


def lines_gz(filename):
    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            yield line.decode('utf8')


def lines_csv(filename):
    with open(filename, 'r') as fp:
        for line in progress(fp, 'load-csv'):
            yield line


def load_conllu(filename):
    import conllu

    result = []
    words = {}
    links = []

    def sentence_end(bad_sentence, sent_id):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + sent_id)
                continue
            words[lfrom].add_link(ana, words[ldest])

        result.extend(words.values())

    with open(filename, 'r') as f:
        data = f.read()
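        # NOTE: each token produced by the conllu library behaves like a
        # dict with keys such as 'id', 'form', 'lemma', 'upos', 'xpos',
        # 'head' and 'deprel'; multiword-token ranges carry a tuple id
        # (e.g. (1, '-', 2)) and are skipped below.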
        conlls = conllu.parse_incr(StringIO(data))
        # build the dependency parse, one sentence at a time
        for sent in conlls:
            try:
                # adding fake root word
                words['0'] = Word('', '', '0', '', False, True)
                for word in sent:
                    if isinstance(word['id'], tuple):
                        continue

                    full_id = "{}.{}".format(sent.metadata['sent_id'], str(word['id']))
                    words[str(word['id'])] = Word(word['lemma'], word['upos'], full_id, word['form'], False)
                    links.append((str(word['head']), str(word['id']), word['deprel']))
                sentence_end(False, sent.metadata['sent_id'])
                links = []
                words = {}
            except Exception:
                links = []
                words = {}
                logging.error(f"Error while reading file {filename} in sentence {sent.metadata['sent_id']}. Check if required data is available!")

    return result


def load_csv(filename, compressed):
    result = []
    bad_sentence = False

    words = {}
    links = []

    def sentence_end(bad_sentence):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])

        result.extend(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        # the dump is comma-separated; turn separators into tabs, then
        # restore a literal ',' token (which shows up as a run of three tabs)
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        # word id "1" marks the start of a new sentence
        if line_split[1] == "1" and len(words) > 0:
            # adding fake root word
            words['0'] = Word('', '', '0', '', False, True)
            sentence_end(bad_sentence)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # malformed line: flag the sentence so sentence_end drops it
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # if link_src != '0':
        links.append((link_src, wid, link_type))

    # adding fake root word
    words['0'] = Word('', '', '0', '', False, True)
    sentence_end(bad_sentence)
    return result


def load_csv_valency(filename, compressed, w_collection):
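    """Load a corpus dump and pair each sentence with its valency metadata.

    A sketch of the expected input, inferred from the parsing below: one
    token per line with tab-separated fields
    (sentence_id, word_id, text, msd, lemma, link_src, link_type).
    ``w_collection`` is assumed to be a MongoDB-style collection whose
    documents are keyed by sentence id.
    """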
    # TODO skip sentences that are not in sentences of interest!!!
    result = {}
    bad_sentence = False

    words = {}
    links = []
    idi = 0

    def sentence_end(bad_sentence, sid):
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])

        result[sid] = list(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        # a run of three tabs stands for a literal ',' token
        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        # word id "1" marks the start of a new sentence
        if line_split[1] == "1" and len(words) > 0:
            sentence_end(bad_sentence, sid)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # malformed line: flag the sentence so sentence_end drops it
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # only non-punctuation tokens (anything but a single non-word
        # character) get a running token index
        if not (len(text) == 1 and re.match(r'^\w+$', text) is None):
            words[wid].idi = str(idi)
            idi += 1

        if link_src != '0':
            links.append((link_src, wid, link_type))

    sentence_end(bad_sentence, sid)

    sentence_ids = list(result.keys())
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    cur = [c for c in cur]
    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'})
                       for c in cur]
    # sort by (document id, paragraph number, sentence number)
    return sorted(unsorted_result,
                  key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))


def load_xml(filename):
    with open(filename, 'r') as fp:
        content = fp.read()

    # strip the default namespace and 'xml:' attribute prefixes so that
    # tags and attributes can be addressed without namespace qualifiers
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, args):
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    pc_tag = args.pc_tag
    use_punctuations = not args.ignore_punctuations
    words = {}

    paragraphs = list(et.iter('p'))
    for paragraph in progress(paragraphs, "load-text"):
        previous_glue = ''
        sentences = list(paragraph.iter('s'))
        for sentence in sentences:
            # create fake root word
            words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
            last_word_id = None

            if args.new_tei:
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            words[w.get('id')].previous_glue = previous_glue
                            words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
            else:
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = ''
                            last_word_id = None
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            last_word_id = w.get('id')
                            words[w.get('id')].previous_glue = previous_glue
                            previous_glue = ''
                    elif use_punctuations and w.tag == 'c':
                        # always save previous glue
                        previous_glue = w.text
                        if last_word_id:
                            words[last_word_id].glue += w.text

            for l in sentence.iter("link"):
                if 'dep' in l.keys():
                    ana = l.get('afun')
                    lfrom = l.get('from')
                    dest = l.get('dep')
                else:
                    ana = l.get('ana')
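                    # only analyses carrying the 'jos-syn:' prefix are
                    # syntactic dependencies; don't bother with the rest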
                    if ana[:8] != 'jos-syn:':
                        continue
                    ana = ana[8:]
                    lfrom, dest = l.get('target').replace('#', '').split()

                if lfrom in words:
                    if not skip_id_check and is_root_id(lfrom):
                        logging.error("Id {} is not fine, you might want to try with flag --skip-id-check".format(lfrom))
                        sys.exit(1)

                    if dest in words:
                        next_word = words[dest]
                        words[lfrom].add_link(ana, next_word)
                    else:
                        logging.error("Unknown id: {}".format(dest))
                        sys.exit(1)
                else:
                    # strange errors, just skip them...
                    pass

    return list(words.values())


def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    words = {}
    sentences = list(et.iter('s'))
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        if sentence.attrib['id'] not in sentences_of_interest:
            continue
        idi = 0
        last_word_id = None
        for w in sentence.iter():
            if w.tag == 'w':
                last_word_id = w.get('id')
                words[last_word_id] = Word.from_xml(w, do_msd_translate)
                words[last_word_id].idi = str(idi)
                idi += 1
            elif w.tag == pc_tag:
                last_word_id = w.get('id')
                words[last_word_id] = Word.pc_word(w, do_msd_translate)
            elif w.tag == 'c':
                if last_word_id:
                    words[last_word_id].glue += w.text

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                ana = l.get('ana')
                if ana[:8] != 'jos-syn:':
                    # don't bother with non-syntactic links
                    continue
                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("Id {} is not fine, you might want to try with flag --skip-id-check".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)
            else:
                # strange errors, just skip them...
                pass

        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
        words = {}


def file_sentence_glue_generator(files, pc_tag, w_collection):
    for fname in files:
        et = load_xml(fname)
        words = {}
        sentences = list(et.iter('s'))
        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            if sentence.attrib['id'] not in sentences_of_interest:
                continue
            w_id = 1
            last_word_id = None
            sentence_id = None
            for w in sentence.iter():
                if w.tag == 'w':
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == pc_tag:
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == 'c':
                    if last_word_id:
                        words[last_word_id][2] += w.text
                elif w.tag == 's':
                    sentence_id = w.attrib['id']

            yield (sentence_id, list(words.values()))
            words = {}
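
# A minimal usage sketch (an assumption, not part of the module): `args` is an
# argparse-style namespace carrying the attributes read above (input,
# skip_id_check, no_msd_translate, pc_tag, ignore_punctuations, new_tei), and
# `database` is the project's database wrapper exposing init/execute/commit:
#
#     for sentences in load_files(args, database):
#         for word in sentences:
#             ...  # each item is a luscenje_struktur.word.Word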