Added support for valency
This commit is contained in:
parent
e38ff4c7b0
commit
41952738ed
184
src/loader.py
184
src/loader.py
|
@ -14,8 +14,8 @@ def is_root_id(id_):
|
||||||
return len(id_.split('.')) == 3
|
return len(id_.split('.')) == 3
|
||||||
|
|
||||||
|
|
||||||
def load_files(args, database):
|
def load_files(args, database, w_collection=None, input_corpus=None):
|
||||||
filenames = args.input
|
filenames = input_corpus if input_corpus is not None else args.input
|
||||||
skip_id_check = args.skip_id_check
|
skip_id_check = args.skip_id_check
|
||||||
do_msd_translate = not args.no_msd_translate
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
|
@ -29,22 +29,35 @@ def load_files(args, database):
|
||||||
extension = pathlib.Path(fname).suffix
|
extension = pathlib.Path(fname).suffix
|
||||||
|
|
||||||
# check if file with the same name already loaded...
|
# check if file with the same name already loaded...
|
||||||
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
|
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
|
||||||
if loaded is not None:
|
if loaded is not None:
|
||||||
print("ALREADY LOADED")
|
print("ALREADY LOADED")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if extension == ".xml":
|
if extension == ".xml":
|
||||||
et = load_xml(fname)
|
et = load_xml(fname)
|
||||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
if input_corpus is None:
|
||||||
|
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||||
|
else:
|
||||||
|
sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
|
||||||
|
for sent_id, sentence, othr_attributes in sentence_generator:
|
||||||
|
yield sent_id, sentence, othr_attributes
|
||||||
elif extension == ".gz":
|
elif extension == ".gz":
|
||||||
yield load_csv(fname, True)
|
if input_corpus is None:
|
||||||
|
yield load_csv(fname, True)
|
||||||
|
else:
|
||||||
|
sentences = load_csv_valency(fname, True, w_collection)
|
||||||
|
for sentence in sentences:
|
||||||
|
yield sentence
|
||||||
else:
|
else:
|
||||||
yield load_csv(fname, False)
|
if input_corpus is None:
|
||||||
# else:
|
yield load_csv(fname, False)
|
||||||
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
else:
|
||||||
|
sentences = load_csv_valency(fname, False, w_collection)
|
||||||
|
for sentence in sentences:
|
||||||
|
yield sentence
|
||||||
|
|
||||||
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
|
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
|
||||||
database.commit()
|
database.commit()
|
||||||
|
|
||||||
|
|
||||||
|
@ -95,14 +108,70 @@ def load_csv(filename, compressed):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
bad_sentence = True
|
bad_sentence = True
|
||||||
full_id = "{}.{}".format(sid, wid)
|
full_id = "{}.{}".format(sid, wid)
|
||||||
|
|
||||||
words[wid] = Word(lemma, msd, full_id, text, True)
|
words[wid] = Word(lemma, msd, full_id, text, True)
|
||||||
if link_src != '0':
|
if link_src != '0':
|
||||||
links.append((link_src, wid, link_type))
|
links.append((link_src, wid, link_type))
|
||||||
|
|
||||||
sentence_end(bad_sentence)
|
sentence_end(bad_sentence)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def load_csv_valency(filename, compressed, w_collection):
    """Load a dependency-parsed corpus from a CSV/TSV dump for valency extraction.

    Like ``load_csv`` but, instead of returning all sentences, keeps only the
    sentences whose ids are present in the Mongo collection *w_collection*
    ("sentences of interest") and pairs each with the extra attributes stored
    there.

    Parameters:
        filename     -- path to the (optionally gzipped) TSV file; one token
                        per line: sid, wid, text, msd, lemma, link_src, link_type
        compressed   -- True to read via ``lines_gz``, False via ``lines_csv``
        w_collection -- Mongo-like collection; ``find`` on ``_id`` yields the
                        per-sentence attribute documents

    Returns:
        list of (sentence_id, [Word, ...], attributes_dict) tuples, sorted by
        the three dot-separated components of the sentence id
        (document, paragraph-number, sentence-number).
    """
    # TODO skip sentences that are not in sentences of interest earlier,
    # while parsing, instead of filtering at the end.
    result = {}
    bad_sentence = False

    words = {}
    links = []
    idi = 0  # running index over "real" (word-like) tokens in the sentence

    def sentence_end(bad_sentence, sid):
        # Finalize the current sentence: wire up dependency links and store it.
        # Sentences that failed to parse are dropped entirely.
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                # lazy %-formatting; line_split is the closure's current line
                logging.warning("Bad link in sentence: %s", line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])
        result[sid] = list(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        # empty fields come through as consecutive tabs; substitute a comma
        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        # word id "1" marks the start of a new sentence: flush the previous one
        if line_split[1] == "1" and len(words) > 0:
            sentence_end(bad_sentence, sid)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Malformed line: mark the sentence bad and skip the line.
            # (Skipping also avoids using unbound/stale sid/wid below; bad
            # sentences are discarded by sentence_end anyway.)
            bad_sentence = True
            continue
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # Assign a token index only to word-like tokens (skip single
        # non-word characters, i.e. punctuation).
        if not (len(text[0]) == 1 and re.match(r'^[\w]+$', text[0]) is None):
            words[wid].idi = str(idi)
            idi += 1

        if link_src != '0':  # '0' denotes the artificial root -> no link
            links.append((link_src, wid, link_type))

    # flush the trailing sentence; guard against a completely empty/unparsable
    # file where sid was never bound
    if words:
        sentence_end(bad_sentence, sid)

    # keep only sentences of interest and attach their stored attributes
    sentence_ids = list(result.keys())
    cur = list(w_collection.find({'_id': {'$in': sentence_ids}}))
    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'}) for c in cur]
    # sentence ids look like "<doc>.<par>.<sent>"; sort numerically on the
    # last two components
    return sorted(unsorted_result, key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))
|
||||||
|
|
||||||
def load_xml(filename):
|
def load_xml(filename):
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
content = fp.read()
|
content = fp.read()
|
||||||
|
@ -150,3 +219,96 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return list(words.values())
|
return list(words.values())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    """Yield sentences of interest from a parsed TEI-like XML tree.

    For every ``<s>`` element whose id is found in *w_collection*, builds the
    sentence's Word objects (with dependency links attached) and yields
    ``(sentence_id, [Word, ...], attributes_dict)``, where the attributes come
    from the collection document (minus its ``_id``).

    Parameters:
        et               -- ElementTree root of the loaded XML file
        skip_id_check    -- when False, abort on links originating at a root id
        do_msd_translate -- forwarded to Word construction (MSD translation)
        pc_tag           -- tag name used for punctuation tokens
        w_collection     -- Mongo-like collection queried by sentence ``_id``
    """
    words = {}
    sentences = list(et.iter('s'))
    # one bulk query for all sentence ids; the result maps id -> stored attrs
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        if sentence.attrib['id'] not in sentences_of_interest:
            continue
        idi = 0  # running index over word tokens within the sentence
        last_word_id = None
        for w in sentence.iter():
            if w.tag == 'w':
                last_word_id = w.get('id')
                words[last_word_id] = Word.from_xml(w, do_msd_translate)
                # only real words get an idi; punctuation does not
                words[last_word_id].idi = str(idi)
                idi += 1
            elif w.tag == pc_tag:
                last_word_id = w.get('id')
                words[last_word_id] = Word.pc_word(w, do_msd_translate)
            elif w.tag == 'c':
                # whitespace/glue between tokens: append to the previous token
                if last_word_id:
                    words[last_word_id].glue += w.text

        # second pass: attach dependency links; two encodings are supported
        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # old-style link: explicit afun/from/dep attributes
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # new-style link: "jos-syn:<relation>" in ana, ids in target
                ana = l.get('ana')
                if ana[:8] != 'jos-syn:':  # don't bother with non-syntax links
                    continue
                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                # a link must not originate at a sentence root id
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass
        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
        words = {}  # reset for the next sentence
|
||||||
|
|
||||||
|
|
||||||
|
def file_sentence_glue_generator(files, pc_tag, w_collection):
    """Yield raw token text (with trailing glue) for sentences of interest.

    For each XML file in *files*, yields ``(sentence_id, tokens)`` for every
    sentence whose id is present in *w_collection*, where each token is a
    mutable ``[text, position, glue]`` triple (position is 1-based within the
    sentence; glue is the whitespace/characters following the token).

    Parameters:
        files        -- iterable of XML file paths
        pc_tag       -- tag name used for punctuation tokens
        w_collection -- Mongo-like collection queried by sentence ``_id``
    """
    for fname in files:
        et = load_xml(fname)

        words = {}
        sentences = list(et.iter('s'))

        # bulk lookup of the sentence ids we actually care about
        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            if sentence.attrib['id'] not in sentences_of_interest:
                continue
            w_id = 1  # 1-based token position within the sentence
            last_word_id = None
            sentence_id = None
            for w in sentence.iter():
                if w.tag == 'w':
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == pc_tag:
                    # punctuation is recorded the same way as words here
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == 'c':
                    # glue characters attach to the preceding token
                    if last_word_id:
                        words[last_word_id][2] += w.text
                elif w.tag == 's':
                    # sentence.iter() yields the <s> element itself first
                    sentence_id = w.attrib['id']

            yield (sentence_id, list(words.values()))
            words = {}  # reset for the next sentence
|
||||||
|
|
|
@ -4,6 +4,13 @@ import logging
|
||||||
from msd_translate import MSD_TRANSLATE
|
from msd_translate import MSD_TRANSLATE
|
||||||
|
|
||||||
|
|
||||||
|
class WordCompressed:
    """Compact per-word record used by the valency pipeline.

    Plain data holder grouping a word's surface text, its collocation
    information, and its dependency-tree data.

    Attributes:
        text            -- the word's text as passed by the caller
        collocation     -- collocation data for the word
        dependency_tree -- dependency-tree data for the word
    """

    def __init__(self, text, collocation, dependency_tree):
        self.text = text
        self.collocation = collocation
        self.dependency_tree = dependency_tree

    def __repr__(self):
        # debugging aid; does not affect any existing behavior
        return "WordCompressed(text={!r}, collocation={!r}, dependency_tree={!r})".format(
            self.text, self.collocation, self.dependency_tree)
|
||||||
|
|
||||||
|
|
||||||
class WordMsdOnly:
|
class WordMsdOnly:
|
||||||
def __init__(self, msd):
|
def __init__(self, msd):
|
||||||
self.msd = msd
|
self.msd = msd
|
||||||
|
|
Loading…
Reference in New Issue
Block a user