2019-06-17 13:38:55 +00:00
|
|
|
from xml.etree import ElementTree
|
|
|
|
import logging
|
|
|
|
import re
|
|
|
|
import sys
|
2019-06-17 18:41:11 +00:00
|
|
|
import gzip
|
|
|
|
import pathlib
|
2019-06-17 13:38:55 +00:00
|
|
|
|
2019-06-17 15:30:51 +00:00
|
|
|
from progress_bar import progress
|
2019-06-17 13:38:55 +00:00
|
|
|
from word import Word
|
|
|
|
|
|
|
|
|
|
|
|
def is_root_id(id_):
    """Return True when ``id_`` has exactly three dot-separated parts.

    Args:
        id_: identifier string, e.g. ``"doc.3.7"``.

    Returns:
        bool: ``True`` if splitting ``id_`` on ``'.'`` gives three pieces
        (i.e. the string contains exactly two dots), ``False`` otherwise.
    """
    return len(id_.split('.')) == 3
|
|
|
|
|
|
|
|
|
2019-08-21 09:12:38 +00:00
|
|
|
def load_files(args, database):
    """Lazily load every input file that has not been loaded before.

    This is a generator: nothing happens until it is iterated.  For each
    file name that is not yet recorded in the ``Files`` table it yields the
    parsed content, and records the file name once the consumer asks for
    the next item.

    Args:
        args: options object.  The attributes read here are ``input``
            (sequence of file names), ``skip_id_check``,
            ``no_msd_translate`` and ``pc_tag``.
        database: object providing ``init(sql)``, ``execute(sql, params)``
            (whose result offers ``fetchone()``) and ``commit()``.

    Yields:
        For ``.xml`` files the result of ``file_sentence_generator``;
        for every other file the result of ``load_csv`` (``.gz`` files are
        read as compressed).
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for idx, fname in enumerate(filenames):
        print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
        extension = pathlib.Path(fname).suffix

        # Skip files whose name is already recorded as loaded.
        loaded = database.execute(
            "SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
        if loaded is not None:
            print("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            yield file_sentence_generator(
                et, skip_id_check, do_msd_translate, args.pc_tag)
        elif extension == ".gz":
            yield load_csv(fname, True)
        else:
            # Any other extension is treated as an uncompressed CSV file.
            yield load_csv(fname, False)

        # Reached only when the consumer resumes the generator after the
        # yield above, so the file is marked as loaded after its content
        # has been handed over.
        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
        database.commit()
|
|
|
|
|
|
|
|
|
2019-08-21 09:09:47 +00:00
|
|
|
def lines_gz(filename):
    """Yield the lines of a gzip-compressed file as UTF-8 decoded strings.

    The file is opened in binary mode, iterated through ``progress`` under
    the label ``'load-gz'``, and each line is decoded before being yielded.

    Args:
        filename: path of the gzip file to read.

    Yields:
        str: one decoded line at a time (line ending included).
    """
    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            yield line.decode('utf8')
|
|
|
|
|
|
|
|
|
|
|
|
def lines_csv(filename):
    """Yield the lines of an uncompressed text file.

    The file is iterated through ``progress`` under the label
    ``'load-csv'``.

    Args:
        filename: path of the text file to read.

    Yields:
        str: one line at a time (line ending included).
    """
    # Decode explicitly as UTF-8 (the same encoding lines_gz uses) instead
    # of depending on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line
|
2019-06-17 18:41:11 +00:00
|
|
|
|
|
|
|
|
2019-08-21 09:09:47 +00:00
|
|
|
def load_csv(filename, compressed):
    """Load words and their links from a (optionally gzipped) CSV/TSV file.

    Every non-blank line is expected to hold seven fields, unpacked as
    ``sid, wid, text, msd, lemma, link_src, link_type``.  A line whose
    second field is ``"1"`` starts a new sentence.  When a sentence ends,
    its links are attached to the words and the words are appended to the
    result.  A sentence containing a malformed line is dropped entirely.

    Args:
        filename: path of the file to read.
        compressed: if true the file is read with ``lines_gz``, otherwise
            with ``lines_csv``.

    Returns:
        list: the ``Word`` objects of all well-formed sentences, in file
        order.
    """
    result = []
    words = {}
    links = []
    bad_sentence = False
    # Id of the sentence currently being collected; used in warnings.
    current_sid = None

    def sentence_end(bad, sid):
        """Attach the collected links and store the words, unless ``bad``."""
        if bad:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: %s", sid)
                continue
            words[lfrom].add_link(ana, words[ldest])
        result.extend(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        if not line_str:
            # Blank lines carry no word; skip them.
            continue

        # Commas become tabs; a run of three tabs is then turned back into
        # tab-comma-tab (presumably restoring a field that was a literal
        # comma -- TODO confirm against the input format).
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        # A word id of "1" marks the first word of a new sentence: flush
        # whatever was collected so far.  bad_sentence is checked as well so
        # the flag is reset even when every line of the sentence was bad.
        starts_sentence = len(line_split) > 1 and line_split[1] == "1"
        if starts_sentence and (words or bad_sentence):
            sentence_end(bad_sentence, current_sid)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Wrong number of fields: drop the whole sentence and do not
            # touch the (unset or stale) per-line variables.
            bad_sentence = True
            continue

        current_sid = sid
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        if link_src != '0':
            links.append((link_src, wid, link_type))

    # Flush the last sentence of the file.
    sentence_end(bad_sentence, current_sid)
    return result
|
2019-06-17 13:38:55 +00:00
|
|
|
|
|
|
|
def load_xml(filename):
    """Parse an XML file into an ElementTree element, without namespaces.

    Before parsing, the first ``xmlns="..."`` declaration is removed and
    every `` xml:`` attribute prefix is dropped, so that elements and
    attributes can be addressed by their plain names (e.g. ``id`` instead
    of ``xml:id``).

    Args:
        filename: path of the XML file to read.

    Returns:
        xml.etree.ElementTree.Element: the root element of the document.
    """
    # Read explicitly as UTF-8 (the XML default) instead of depending on
    # the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
|
|
|
|
|
|
|
|
|
|
|
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build the words of a parsed XML document and connect them by links.

    Despite its name this function is not a generator: it processes every
    ``<s>`` element of the tree and returns one list at the end.

    Args:
        et: root element of the document (namespace-free).
        skip_id_check: when false, a link whose source id satisfies
            ``is_root_id`` aborts the program.
        do_msd_translate: passed through to ``Word.from_xml`` and
            ``Word.pc_word``.
        pc_tag: tag name of the elements built with ``Word.pc_word``.

    Returns:
        list: all ``Word`` objects created for the document.

    Raises:
        SystemExit: on a link from a root-like id (unless
            ``skip_id_check``) or on a link to an unknown target id.
    """
    # Ids are collected for the whole document, not per sentence.
    words = {}
    # Materialised before being handed to progress().
    sentences = list(et.iter('s'))

    for sentence in progress(sentences, "load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                # Format with separate afun / from / dep attributes.
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                # Format with ana="syn:<type>" and target="#from #dest".
                ana = link.get('ana')
                # Links without a "syn:" annotation (or with no annotation
                # at all) are not of interest here.
                if ana is None or ana[:4] != 'syn:':
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())
|