diff --git a/src/loader.py b/src/loader.py
index cd8bd83..1d6f86e 100644
--- a/src/loader.py
+++ b/src/loader.py
@@ -14,8 +14,8 @@ def is_root_id(id_):
     return len(id_.split('.')) == 3


-def load_files(args, database):
-    filenames = args.input
+def load_files(args, database, w_collection=None, input_corpus=None):
+    filenames = input_corpus if input_corpus is not None else args.input
     skip_id_check = args.skip_id_check
     do_msd_translate = not args.no_msd_translate

@@ -29,22 +29,35 @@ def load_files(args, database):
         extension = pathlib.Path(fname).suffix

         # check if file with the same name already loaded...
-        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
+        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
         if loaded is not None:
             print("ALREADY LOADED")
             continue

         if extension == ".xml":
             et = load_xml(fname)
-            yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+            if input_corpus is None:
+                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+            else:
+                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
+                for sent_id, sentence, othr_attributes in sentence_generator:
+                    yield sent_id, sentence, othr_attributes
         elif extension == ".gz":
-            yield load_csv(fname, True)
+            if input_corpus is None:
+                yield load_csv(fname, True)
+            else:
+                sentences = load_csv_valency(fname, True, w_collection)
+                for sentence in sentences:
+                    yield sentence
         else:
-            yield load_csv(fname, False)
-        # else:
-        #     raise NotImplementedError("Unknown file extension: {}".format(extension))
+            if input_corpus is None:
+                yield load_csv(fname, False)
+            else:
+                sentences = load_csv_valency(fname, False, w_collection)
+                for sentence in sentences:
+                    yield sentence

-        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
+        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
         database.commit()


@@ -95,14 +108,70 @@ def load_csv(filename, compressed):
         except ValueError:
             bad_sentence = True
         full_id = "{}.{}".format(sid, wid)
-
+
         words[wid] = Word(lemma, msd, full_id, text, True)
         if link_src != '0':
             links.append((link_src, wid, link_type))
-
+
         sentence_end(bad_sentence)
     return result

+
+def load_csv_valency(filename, compressed, w_collection):
+    # TODO: skip sentences that are not among the sentences of interest
+    result = {}
+    bad_sentence = False
+
+    words = {}
+    links = []
+    idi = 0
+
+    def sentence_end(bad_sentence, sid):
+        if bad_sentence:
+            return
+
+        for lfrom, ldest, ana in links:
+            if lfrom not in words or ldest not in words:
+                logging.warning("Bad link in sentence: " + sid)
+                continue
+            words[lfrom].add_link(ana, words[ldest])
+        result[sid] = list(words.values())
+
+    line_gen = lines_gz if compressed else lines_csv
+    for line in line_gen(filename):
+        line_str = line.strip()
+        line_fixed = line_str.replace('\t\t\t', '\t,\t')
+        line_split = line_fixed.split("\t")
+
+        if line_split[1] == "1" and len(words) > 0:
+            sentence_end(bad_sentence, sid)
+            bad_sentence = False
+            links = []
+            words = {}
+            idi = 0
+
+        try:
+            sid, wid, text, msd, lemma, link_src, link_type = line_split
+        except ValueError:
+            bad_sentence = True
+        full_id = "{}.{}".format(sid, wid)
+
+        words[wid] = Word(lemma, msd, full_id, text, True)
+        if re.match(r'^\w', text) is not None:  # running index only for word-like tokens, not punctuation
+            words[wid].idi = str(idi)
+            idi += 1
+
+        if link_src != '0':
+            links.append((link_src, wid, link_type))
+
+    sentence_end(bad_sentence, sid)
+
+    sentence_ids = list(result.keys())
+    cur = w_collection.find({'_id': {'$in': sentence_ids}})
+    cur = list(cur)
+    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'}) for c in cur]
+    return sorted(unsorted_result, key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))
+

 def load_xml(filename):
     with open(filename, 'r') as fp:
         content = fp.read()
@@ -150,3 +219,96 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
                 pass

     return list(words.values())
+
+
+
+def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
+    words = {}
+    sentences = list(et.iter('s'))
+    sentence_ids = [s.attrib['id'] for s in sentences]
+    cur = w_collection.find({'_id': {'$in': sentence_ids}})
+    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}
+
+    for sentence in progress(sentences, "load-text"):
+        if sentence.attrib['id'] not in sentences_of_interest:
+            continue
+        idi = 0
+        last_word_id = None
+        for w in sentence.iter():
+            if w.tag == 'w':
+                last_word_id = w.get('id')
+                words[last_word_id] = Word.from_xml(w, do_msd_translate)
+                words[last_word_id].idi = str(idi)
+                idi += 1
+            elif w.tag == pc_tag:
+                last_word_id = w.get('id')
+                words[last_word_id] = Word.pc_word(w, do_msd_translate)
+            elif w.tag == 'c':
+                if last_word_id:
+                    words[last_word_id].glue += w.text
+
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
+            else:
+                ana = l.get('ana')
+                if ana[:8] != 'jos-syn:':  # don't bother...
+                    continue
+                ana = ana[8:]
+                lfrom, dest = l.get('target').replace('#', '').split()
+
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
+
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
+        words = {}
+
+
+def file_sentence_glue_generator(files, pc_tag, w_collection):
+    for fname in files:
+        et = load_xml(fname)
+
+        words = {}
+        sentences = list(et.iter('s'))
+
+        sentence_ids = [s.attrib['id'] for s in sentences]
+        cur = w_collection.find({'_id': {'$in': sentence_ids}})
+        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}
+
+        for sentence in progress(sentences, "load-text"):
+            if sentence.attrib['id'] not in sentences_of_interest:
+                continue
+            w_id = 1
+            last_word_id = None
+            sentence_id = None
+            for w in sentence.iter():
+                if w.tag == 'w':
+                    last_word_id = w_id
+                    words[last_word_id] = [w.text, last_word_id, '']
+                    w_id += 1
+                elif w.tag == pc_tag:
+                    last_word_id = w_id
+                    words[last_word_id] = [w.text, last_word_id, '']
+                    w_id += 1
+                elif w.tag == 'c':
+                    if last_word_id:
+                        words[last_word_id][2] += w.text
+                elif w.tag == 's':
+                    sentence_id = w.attrib['id']
+
+            yield (sentence_id, list(words.values()))
+            words = {}
diff --git a/src/word.py b/src/word.py
index f30522c..53f4036 100644
--- a/src/word.py
+++ b/src/word.py
@@ -4,6 +4,13 @@ import logging
 from msd_translate import MSD_TRANSLATE


+class WordCompressed:
+    def __init__(self, text, collocation, dependency_tree):
+        self.text = text
+        self.collocation = collocation
+        self.dependency_tree = dependency_tree
+
+
 class WordMsdOnly:
     def __init__(self, msd):
         self.msd = msd
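Note (not part of the patch): below is a minimal usage sketch for the new valency
path added to load_files. Everything in it is assumed glue, not code from this
repository: DB_PATH, MONGO_URI, CORPUS_FILES, the Mongo database/collection names
and the Files table schema are hypothetical stand-ins, and args simply mirrors the
fields load_files already reads (input, skip_id_check, no_msd_translate, pc_tag).
It also assumes `re` (alongside the existing `logging`) is imported at the top of
loader.py, which the patch itself does not show.

    # Hedged sketch: drive load_files() in valency mode.
    # All constants below are hypothetical placeholders.
    import argparse
    import sqlite3

    from pymongo import MongoClient

    import loader  # src/loader.py as modified by this patch

    DB_PATH = "files_cache.db"             # hypothetical sqlite file
    MONGO_URI = "mongodb://localhost:27017"
    CORPUS_FILES = ["corpus/part1.xml"]    # hypothetical corpus shard

    args = argparse.Namespace(
        input=[], skip_id_check=False, no_msd_translate=False, pc_tag="pc"
    )

    database = sqlite3.connect(DB_PATH)
    # Minimal stand-in for the real schema; load_files only reads/writes `filename`.
    database.execute("CREATE TABLE IF NOT EXISTS Files (filename TEXT)")

    # Collection whose _id values are sentence ids (e.g. "ssj.1.1"); each document's
    # remaining keys are passed through as the third element of every yielded triple.
    w_collection = MongoClient(MONGO_URI)["valency"]["sentences"]

    # With input_corpus set, load_files yields (sentence_id, words, attrs) per
    # matched sentence instead of one word list per file.
    for sent_id, words, attrs in loader.load_files(
            args, database, w_collection=w_collection, input_corpus=CORPUS_FILES):
        print(sent_id, len(words), sorted(attrs))

For the CSV/.gz path, load_csv_valency sorts its output by the three components of
the sentence id (corpus id, then the two numeric parts); the XML path yields the
matched sentences in document order.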