You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
luscenje_struktur/luscenje_struktur/loader.py

315 lines
10 KiB

import os
from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from progress_bar import progress
from word import Word
def is_root_id(id_):
    """Return True when the dot-separated ``id_`` has exactly three components."""
    # Three components are separated by exactly two dots.
    return id_.count('.') == 2
def load_files(args, database, w_collection=None, input_corpus=None):
    """Lazily load every input file that is not yet recorded in the database.

    The file list is ``input_corpus`` when given, otherwise ``args.input``.
    If the list is a single directory it is expanded to the directory's
    entries, leaving out names ending in ``.zstd``.

    Without ``input_corpus`` the generator yields one item per file: the
    result of ``file_sentence_generator`` for ``.xml`` files, or of
    ``load_csv`` for everything else (``.gz`` is read as compressed).
    With ``input_corpus`` it yields the individual items produced by
    ``file_sentence_generator_valency`` / ``load_csv_valency``.

    After a file's items have been consumed its name is inserted into the
    ``Files`` table and committed; files already present there are skipped.
    """
    valency_mode = input_corpus is not None
    filenames = input_corpus if valency_mode else args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    if len(filenames) == 1 and os.path.isdir(filenames[0]):
        directory = filenames[0]
        filenames = [
            os.path.join(directory, entry)
            for entry in os.listdir(directory)
            if entry[-5:] != '.zstd'
        ]

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    total = len(filenames)
    for idx, fname in enumerate(filenames):
        print("FILE ", fname, "{}/{}".format(idx, total))
        extension = pathlib.Path(fname).suffix

        # Skip files whose name is already registered as loaded.
        previous = database.execute(
            "SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
        if previous is not None:
            print("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            if valency_mode:
                yield from file_sentence_generator_valency(
                    et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
            else:
                yield file_sentence_generator(
                    et, skip_id_check, do_msd_translate, args.pc_tag)
        else:
            # Anything that is not XML is read as CSV; ".gz" means gzip-compressed.
            compressed = extension == ".gz"
            if valency_mode:
                yield from load_csv_valency(fname, compressed, w_collection)
            else:
                yield load_csv(fname, compressed)

        # Reached only once the consumer has asked for the next item.
        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
        database.commit()
def lines_gz(filename):
    """Yield the lines of a gzip-compressed file, each decoded as UTF-8.

    The file is read in binary mode through ``progress`` (label ``load-gz``)
    and every line is decoded individually.
    """
    with gzip.open(filename, 'rb') as gz_file:
        yield from (raw.decode('utf8') for raw in progress(gz_file, 'load-gz'))
def lines_csv(filename):
    """Yield the lines of an uncompressed text file.

    The file is opened explicitly as UTF-8 so that it is decoded the same way
    as its gzip counterpart ``lines_gz`` instead of depending on the
    platform's default encoding. Lines are passed through ``progress``
    (label ``load-csv``).
    """
    with open(filename, 'r', encoding='utf8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line
def load_csv(filename, compressed):
    """Load the words of a delimited corpus file into one flat list.

    Every line is expected to split into seven fields: sentence id, word id,
    text, msd, lemma, link source and link type. A line whose word id is
    ``"1"`` starts a new sentence. When a sentence ends, its links are attached
    to the words (``Word.add_link``) and the words are appended to the result.
    A sentence containing any malformed line is dropped as a whole.

    Args:
        filename: path of the file to read.
        compressed: read through ``lines_gz`` when true, ``lines_csv`` otherwise.

    Returns:
        List of ``Word`` objects from all well-formed sentences, in file order.
    """
    result = []
    words = {}
    links = []
    bad_sentence = False
    sid = None

    def sentence_end(sentence_words, sentence_links, sentence_id):
        # Resolve the collected (source, destination, label) triples and
        # publish the finished sentence.
        for lfrom, ldest, ana in sentence_links:
            if lfrom not in sentence_words or ldest not in sentence_words:
                logging.warning("Bad link in sentence: %s", sentence_id)
                continue
            sentence_words[lfrom].add_link(ana, sentence_words[ldest])
        result.extend(sentence_words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        if not line_str:
            # Blank lines carry no token and must not trip the field lookups.
            continue

        # Commas are treated as separators too; a run of three tabs is then
        # turned back into tab-comma-tab (presumably a literal "," token).
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        starts_sentence = len(line_split) > 1 and line_split[1] == "1"
        # Also flush when the previous sentence was bad but produced no words,
        # otherwise its bad flag would leak into the sentence starting here.
        if starts_sentence and (words or bad_sentence):
            if not bad_sentence:
                sentence_end(words, links, sid)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Wrong number of fields: poison the sentence and skip the line
            # rather than reusing values left over from an earlier line.
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)
        words[wid] = Word(lemma, msd, full_id, text, True)
        if link_src != '0':
            links.append((link_src, wid, link_type))

    if words and not bad_sentence:
        sentence_end(words, links, sid)
    return result
def load_csv_valency(filename, compressed, w_collection):
    """Load a tab-separated corpus file and pair sentences with stored metadata.

    Every line is expected to split into seven tab-separated fields: sentence
    id, word id, text, msd, lemma, link source and link type. A line whose
    word id is ``"1"`` starts a new sentence. Words whose text starts with a
    word character get a running per-sentence index stored as a string in
    ``idi``. A sentence containing any malformed line is dropped as a whole.

    After parsing, ``w_collection.find`` is queried for the parsed sentence
    ids and only sentences returned by that query are kept.

    Args:
        filename: path of the file to read.
        compressed: read through ``lines_gz`` when true, ``lines_csv`` otherwise.
        w_collection: object with a ``find`` method accepting an
            ``{'_id': {'$in': [...]}}`` filter and returning dict-like records.

    Returns:
        List of ``(sentence_id, words, attributes)`` tuples, where
        ``attributes`` is the record without its ``_id`` key. The list is
        sorted by the three dot-separated parts of the sentence id (first as
        text, second and third as integers).
    """
    # TODO skip sentences that are not in sentences of interest!!!
    word_chars = re.compile(r'^[\w]+$')
    result = {}
    words = {}
    links = []
    bad_sentence = False
    sid = None
    idi = 0

    def sentence_end(sentence_words, sentence_links, sentence_id):
        # Resolve the collected (source, destination, label) triples and
        # store the finished sentence under its id.
        for lfrom, ldest, ana in sentence_links:
            if lfrom not in sentence_words or ldest not in sentence_words:
                logging.warning("Bad link in sentence: %s", sentence_id)
                continue
            sentence_words[lfrom].add_link(ana, sentence_words[ldest])
        result[sentence_id] = list(sentence_words.values())

    def id_sort_key(item):
        parts = item[0].split('.')
        return parts[0], int(parts[1]), int(parts[2])

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        if not line_str:
            # Blank lines carry no token and must not trip the field lookups.
            continue

        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        starts_sentence = len(line_split) > 1 and line_split[1] == "1"
        # Also flush when the previous sentence was bad but produced no words,
        # otherwise its bad flag would leak into the sentence starting here.
        if starts_sentence and (words or bad_sentence):
            if not bad_sentence:
                sentence_end(words, links, sid)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Wrong number of fields: poison the sentence and skip the line
            # rather than reusing values left over from an earlier line.
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)
        words[wid] = Word(lemma, msd, full_id, text, True)
        # Only the first character is inspected, so any token starting with a
        # non-word character is left without an index.
        # NOTE(review): the original tested len(text[0]) == 1, which is always
        # true -- possibly len(text) == 1 was intended; behaviour kept as is.
        if text and word_chars.match(text[0]) is not None:
            words[wid].idi = str(idi)
            idi += 1
        if link_src != '0':
            links.append((link_src, wid, link_type))

    # Guarding on `words` also covers an empty file, where no id exists yet.
    if words and not bad_sentence:
        sentence_end(words, links, sid)

    sentence_ids = list(result.keys())
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    unsorted_result = [
        (c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'})
        for c in cur
    ]
    return sorted(unsorted_result, key=id_sort_key)
def load_xml(filename):
    """Parse an XML file into an ElementTree element with namespaces removed.

    The first default-namespace declaration (`` xmlns="..."``) is deleted and
    every `` xml:`` attribute prefix is dropped, so elements and attributes
    can be addressed by their bare names (e.g. ``id`` instead of ``xml:id``).

    The file is read as bytes and handed to the parser undecoded, so the
    encoding named in the XML declaration is honoured instead of the
    platform's default text encoding.

    Args:
        filename: path of the XML file.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the document.
    """
    with open(filename, 'rb') as fp:
        content = fp.read()
    xmlbytes = re.sub(rb' xmlns="[^"]+"', b'', content, count=1)
    xmlbytes = xmlbytes.replace(b' xml:', b' ')
    return ElementTree.XML(xmlbytes)
def _link_sentence_words(sentence, words, skip_id_check):
    """Attach the dependency links found under ``sentence`` to ``words``.

    Two link notations are understood: elements carrying ``dep`` / ``from`` /
    ``afun`` attributes, and elements carrying ``ana="jos-syn:<label>"`` with
    a ``target`` of two ``#``-prefixed ids. Links whose source id is unknown
    are skipped; an unknown destination id, or (unless ``skip_id_check``) a
    source id that ``is_root_id`` accepts, is logged and ends the process via
    ``sys.exit(1)``.
    """
    prefix = 'jos-syn:'
    for link in sentence.iter("link"):
        if 'dep' in link.keys():
            ana = link.get('afun')
            lfrom = link.get('from')
            dest = link.get('dep')
        else:
            ana = link.get('ana')
            # A link without an ``ana`` attribute is not a jos-syn link either.
            if ana is None or not ana.startswith(prefix):  # dont bother...
                continue
            ana = ana[len(prefix):]
            lfrom, dest = link.get('target').replace('#', '').split()

        if lfrom not in words:
            # strange errors, just skip...
            continue

        if not skip_id_check and is_root_id(lfrom):
            logging.error("NOO: {}".format(lfrom))
            sys.exit(1)

        if dest not in words:
            logging.error("Unknown id: {}".format(dest))
            sys.exit(1)

        words[lfrom].add_link(ana, words[dest])


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build linked ``Word`` objects for every sentence below ``et``.

    Despite the name this is a plain function: it returns a single list with
    the words of all ``<s>`` elements, keyed internally by their ``id``
    attribute (a repeated id overwrites the earlier word).

    Args:
        et: parsed XML element to search for ``<s>`` elements.
        skip_id_check: when false, a link whose source id passes
            ``is_root_id`` aborts the process.
        do_msd_translate: forwarded to ``Word.from_xml`` / ``Word.pc_word``.
        pc_tag: tag name of the elements built with ``Word.pc_word``.

    Returns:
        List of ``Word`` objects with their links attached.
    """
    words = {}
    sentences = list(et.iter('s'))
    for sentence in progress(sentences, "load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
        _link_sentence_words(sentence, words, skip_id_check)

    return list(words.values())


def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    """Yield linked words for each sentence that ``w_collection`` knows about.

    The ids of all ``<s>`` elements are looked up with ``w_collection.find``;
    sentences without a matching record are skipped. Within a sentence,
    ``<w>`` elements get a running index stored as a string in ``idi``, and
    the text of each ``<c>`` element is appended to the ``glue`` of the word
    or ``pc_tag`` element that precedes it.

    Args:
        et: parsed XML element to search for ``<s>`` elements.
        skip_id_check: when false, a link whose source id passes
            ``is_root_id`` aborts the process.
        do_msd_translate: forwarded to ``Word.from_xml`` / ``Word.pc_word``.
        pc_tag: tag name of the elements built with ``Word.pc_word``.
        w_collection: object with a ``find`` method accepting an
            ``{'_id': {'$in': [...]}}`` filter and returning dict-like records.

    Yields:
        ``(sentence_id, words, attributes)`` where ``attributes`` is the
        matching record without its ``_id`` key.
    """
    sentences = list(et.iter('s'))
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        sentence_id = sentence.attrib['id']
        if sentence_id not in sentences_of_interest:
            continue

        words = {}
        idi = 0
        last_word_id = None
        for element in sentence.iter():
            if element.tag == 'w':
                last_word_id = element.get('id')
                words[last_word_id] = Word.from_xml(element, do_msd_translate)
                words[last_word_id].idi = str(idi)
                idi += 1
            elif element.tag == pc_tag:
                last_word_id = element.get('id')
                words[last_word_id] = Word.pc_word(element, do_msd_translate)
            elif element.tag == 'c':
                if last_word_id:
                    # An empty <c/> has text None; treat it as no glue.
                    words[last_word_id].glue += element.text or ''

        _link_sentence_words(sentence, words, skip_id_check)

        yield sentence_id, list(words.values()), sentences_of_interest[sentence_id]
def file_sentence_glue_generator(files, pc_tag, w_collection):
    """Yield token texts with their trailing glue for sentences of interest.

    Each file is parsed with ``load_xml``; the ids of its ``<s>`` elements are
    looked up with ``w_collection.find`` and sentences without a matching
    record are skipped. Tokens are the ``<w>`` and ``pc_tag`` elements of a
    sentence, numbered from 1 in document order; the text of every ``<c>``
    element is appended to the glue of the token before it.

    Args:
        files: iterable of XML file paths.
        pc_tag: tag name treated as a token alongside ``w``.
        w_collection: object with a ``find`` method accepting an
            ``{'_id': {'$in': [...]}}`` filter and returning dict-like records.

    Yields:
        ``(sentence_id, tokens)`` where ``tokens`` is a list of
        ``[text, position, glue]`` lists.
    """
    for fname in files:
        et = load_xml(fname)
        sentences = list(et.iter('s'))
        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            # Use the id that was checked against the collection, not one
            # picked up from whichever <s> happens to be iterated last.
            sentence_id = sentence.attrib['id']
            if sentence_id not in sentences_of_interest:
                continue

            tokens = []
            for element in sentence.iter():
                if element.tag == 'w' or element.tag == pc_tag:
                    tokens.append([element.text, len(tokens) + 1, ''])
                elif element.tag == 'c':
                    if tokens:
                        # An empty <c/> has text None; treat it as no glue.
                        tokens[-1][2] += element.text or ''

            yield (sentence_id, tokens)