You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

422 lines
15 KiB

import os
from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from io import StringIO
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word
def is_root_id(id_):
    """Return True when *id_* consists of exactly three dot-separated parts.

    :param id_: identifier string such as ``"doc.3.7"``.
    :return: ``True`` if splitting on ``'.'`` gives three components.
    """
    return len(id_.split('.')) == 3
def load_files(args, database, w_collection=None, input_corpus=None):
    """Iterate over the input files and yield their parsed content.

    The loader used for a file is chosen from its suffix: ``.xml``, ``.gz``
    (compressed CSV), ``.conllu``, and anything else is read as plain CSV.

    Two modes exist:

    * ``input_corpus is None`` - file names come from ``args.input`` and one
      item is yielded per file (the return value of the matching loader).
    * ``input_corpus`` given - file names come from ``input_corpus`` and the
      ``*_valency`` loaders are used; their items are yielded one by one.
      ``.conllu`` is not supported in this mode.

    Files already recorded in the ``Files`` table are skipped; every processed
    file is recorded there once the consumer has exhausted its items.

    :param args: parsed options; reads ``input``, ``skip_id_check``,
        ``no_msd_translate`` and ``pc_tag`` (and whatever
        ``file_sentence_generator`` reads).
    :param database: object exposing ``init(sql)`` and ``execute(sql, params)``.
    :param w_collection: collection handed to the ``*_valency`` loaders.
    :param input_corpus: optional list of file names overriding ``args.input``.
    :raises Exception: for a ``.conllu`` file when ``input_corpus`` is given.
    """
    filenames = input_corpus if input_corpus is not None else args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    # A single directory argument is expanded to the files it contains.
    if len(filenames) == 1 and os.path.isdir(filenames[0]):
        directory = filenames[0]
        filenames = [os.path.join(directory, entry)
                     for entry in os.listdir(directory)
                     if not entry.endswith('.zstd')]

    if len(filenames) > 1:
        filenames = [name for name in filenames if not name.endswith('.zstd')]
        try:
            # Chunked inputs end in a numeric part ("corpus.csv.12"); keep
            # them in numeric order.
            filenames = sorted(filenames, key=lambda name: int(name.split('.')[-1]))
        except ValueError:
            # Non-numeric suffixes (e.g. several ".xml" files) used to crash
            # here; fall back to plain lexicographic order.
            filenames = sorted(filenames)

    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for idx, fname in enumerate(filenames):
        # NOTE(review): the call wrapping this message was lost in the
        # available source; logging.info is assumed - confirm.
        logging.info("FILE " + fname + "{}/{}".format(idx, len(filenames)))
        extension = pathlib.Path(fname).suffix

        # check if file with the same name already loaded...
        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
        if loaded is not None:
            logging.info("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            if input_corpus is None:
                yield file_sentence_generator(et, args)
            else:
                sentence_generator = file_sentence_generator_valency(
                    et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                for sent_id, sentence, othr_attributes in sentence_generator:
                    yield sent_id, sentence, othr_attributes
        elif extension == ".gz":
            if input_corpus is None:
                yield load_csv(fname, True)
            else:
                yield from load_csv_valency(fname, True, w_collection)
        elif extension == ".conllu":
            if input_corpus is None:
                yield load_conllu(fname)
            else:
                raise Exception('conllu with input_corpus is not supported!')
        else:
            if input_corpus is None:
                yield load_csv(fname, False)
            else:
                yield from load_csv_valency(fname, False, w_collection)

        # NOTE(review): no commit is visible after this insert - confirm
        # whether the database wrapper persists it on its own.
        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
def lines_gz(filename):
    """Yield the lines of a gzip-compressed file as UTF-8 decoded strings.

    :param filename: path of the ``.gz`` file.
    :return: generator of text lines (line endings are kept).
    """
    # NOTE(review): the opener was lost in the available source; gzip.open is
    # assumed because the lines are bytes that get decoded below - confirm.
    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            yield line.decode('utf8')
def lines_csv(filename):
    """Yield the lines of a plain-text CSV file.

    The file is read as UTF-8 so that it produces the same text as
    ``lines_gz`` does for the compressed variant, independent of the
    platform's default encoding.

    :param filename: path of the CSV file.
    :return: generator of text lines (line endings are kept).
    """
    with open(filename, 'r', encoding='utf8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line
def load_conllu(filename):
    """Load a CoNLL-U file into a flat list of linked ``Word`` objects.

    For each sentence an artificial word is stored under the key ``'0'`` so
    that tokens whose ``head`` is 0 have a source to be linked from. Every
    token is turned into a ``Word`` built from its lemma, UPOS tag and form,
    and a link ``head -> token`` labelled with ``deprel`` is added.

    A sentence that raises while being processed is logged and dropped as a
    whole; the remaining sentences are still loaded.

    :param filename: path of the CoNLL-U file.
    :return: list with the words (including the artificial ``'0'`` word) of
        all sentences that were read successfully.
    """
    import conllu

    result = []

    # CoNLL-U files are UTF-8 by specification.
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()

    # build dep parse
    for sent in conllu.parse_incr(StringIO(data)):
        # Looked up defensively so the error message below cannot itself fail.
        sent_id = sent.metadata.get('sent_id')
        try:
            # adding fake word
            words = {'0': Word('', '', '0', '', False, True)}
            links = []
            for word in sent:
                # Tuple ids are skipped; presumably these are the parser's
                # range/decimal ids (multiword tokens, empty nodes) - confirm.
                if isinstance(word['id'], tuple):
                    continue
                token_id = str(word['id'])
                full_id = "{}.{}".format(sent.metadata['sent_id'], token_id)
                words[token_id] = Word(word['lemma'], word['upos'], full_id, word['form'], False)
                links.append((str(word['head']), token_id, word['deprel']))

            for lfrom, ldest, ana in links:
                if lfrom not in words or ldest not in words:
                    logging.warning("Bad link in sentence: " + sent.metadata['sent_id'])
                    continue
                words[lfrom].add_link(ana, words[ldest])

            # NOTE(review): the statement filling ``result`` was lost in the
            # available source; appending the sentence's words is assumed.
            result.extend(words.values())
        except Exception:
            logging.error(f"Error while reading file {filename} in sentence {sent_id}. Check if required data is available!")

    return result
def load_csv(filename, compressed):
    """Load a (optionally gzip-compressed) CSV corpus into linked ``Word`` objects.

    Each row is expected to hold seven fields: sentence id, word id, text,
    msd, lemma, link source and link type. Commas are treated as field
    separators; a field that was itself a comma is restored afterwards. A row
    whose word id is ``"1"`` starts a new sentence.

    A sentence containing a row that does not have seven fields is dropped.

    :param filename: path of the input file.
    :param compressed: read through ``lines_gz`` when true, else ``lines_csv``.
    :return: list of words of all good sentences; every sentence contributes
        an additional artificial word stored under id ``'0'``.
    """
    result = []

    def sentence_end(sentence_words, sentence_links, is_bad, label):
        # Resolve the collected links and move the sentence into ``result``.
        if is_bad:
            return
        for lfrom, ldest, ana in sentence_links:
            if lfrom not in sentence_words or ldest not in sentence_words:
                logging.warning("Bad link in sentence: " + label)
                continue
            sentence_words[lfrom].add_link(ana, sentence_words[ldest])
        # NOTE(review): the statement filling ``result`` was lost in the
        # available source; appending the sentence's words is assumed.
        result.extend(sentence_words.values())

    bad_sentence = False
    words = {}
    links = []
    label = ''

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")
        # The warning is labelled with the first field of the current row.
        label = line_split[0]

        if line_split[1] == "1" and len(words) > 0:
            # adding fake word
            words['0'] = Word('', '', '0', '', False, True)
            sentence_end(words, links, bad_sentence, label)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Malformed row: the sentence is discarded at its end, so do not
            # build a word from stale (or still unbound) fields.
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)
        words[wid] = Word(lemma, msd, full_id, text, True)
        # if link_src != '0':
        links.append((link_src, wid, link_type))

    # adding fake word
    words['0'] = Word('', '', '0', '', False, True)
    sentence_end(words, links, bad_sentence, label)
    return result
def load_csv_valency(filename, compressed, w_collection):
    """Load a tab-separated corpus and join its sentences with ``w_collection``.

    Rows hold seven fields: sentence id, word id, text, msd, lemma, link
    source and link type. A row whose word id is ``"1"`` starts a new
    sentence. Words whose text starts with a word character get a running
    per-sentence index in ``idi``. Links whose source is ``'0'`` are ignored.

    A sentence containing a row that does not have seven fields is dropped.

    :param filename: path of the input file.
    :param compressed: read through ``lines_gz`` when true, else ``lines_csv``.
    :param w_collection: object with ``find(query)``; queried with
        ``{'_id': {'$in': [...]}}`` (presumably a MongoDB collection - confirm).
    :return: list of ``(sentence_id, words, attributes)`` tuples for the
        sentences found in ``w_collection``, sorted by the three
        dot-separated parts of the sentence id (the last two numerically);
        ``attributes`` is the found document without ``'_id'``.
    """
    # TODO skip sentences that are not in sentences of interest!!!
    result = {}
    word_char = re.compile(r'^[\w]+$')

    def sentence_end(sentence_words, sentence_links, is_bad, sentence_id, label):
        # Resolve the collected links and store the sentence under its id.
        if is_bad:
            return
        for lfrom, ldest, ana in sentence_links:
            if lfrom not in sentence_words or ldest not in sentence_words:
                logging.warning("Bad link in sentence: " + label)
                continue
            sentence_words[lfrom].add_link(ana, sentence_words[ldest])
        result[sentence_id] = list(sentence_words.values())

    bad_sentence = False
    words = {}
    links = []
    idi = 0
    sid = None
    label = ''

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")
        # The warning is labelled with the first field of the current row.
        label = line_split[0]

        if line_split[1] == "1" and len(words) > 0:
            sentence_end(words, links, bad_sentence, sid, label)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Malformed row: the sentence is discarded at its end, so do not
            # build a word from stale (or still unbound) fields.
            bad_sentence = True
            continue

        full_id = "{}.{}".format(sid, wid)
        words[wid] = Word(lemma, msd, full_id, text, True)
        # Only tokens starting with a word character are numbered.
        if text and word_char.match(text[0]) is not None:
            words[wid].idi = str(idi)
            idi += 1

        if link_src != '0':
            links.append((link_src, wid, link_type))

    # Flush the last sentence; nothing to flush for an empty input.
    if words:
        sentence_end(words, links, bad_sentence, sid, label)

    sentence_ids = list(result.keys())
    cur = list(w_collection.find({'_id': {'$in': sentence_ids}}))
    unsorted_result = [
        (c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'})
        for c in cur
    ]

    def sort_key(item):
        parts = item[0].split('.')
        return parts[0], int(parts[1]), int(parts[2])

    return sorted(unsorted_result, key=sort_key)
def load_xml(filename):
    """Parse an XML file after stripping its namespace decorations.

    The first ``xmlns="..."`` declaration is removed and every `` xml:``
    attribute prefix is dropped (so ``xml:id`` becomes ``id``), which lets
    callers address elements and attributes by their bare names.

    :param filename: path of the XML file.
    :return: the root ``Element`` of the parsed document.
    """
    with open(filename, 'r') as fp:
        content = fp.read()

    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
def file_sentence_generator(et, args):
    """Build linked ``Word`` objects for every sentence of a parsed document.

    Despite its name this function returns a list, not a generator.

    For each ``<s>`` inside each ``<p>`` a fake root word is registered under
    the sentence id, every ``<w>`` and punctuation element becomes a ``Word``
    keyed by its ``id`` attribute, and every ``<link>`` of the sentence is
    turned into a link between two of those words. When punctuation is used,
    the whitespace around punctuation is tracked in ``previous_glue`` and
    ``glue``.

    :param et: root element of the parsed document.
    :param args: options; reads ``skip_id_check``, ``no_msd_translate``,
        ``pc_tag``, ``ignore_punctuations`` and ``new_tei``.
    :return: list of all words of the document, fake roots included.
    """
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    pc_tag = args.pc_tag
    use_punctuations = not args.ignore_punctuations

    words = {}
    paragraphs = list(et.iter('p'))
    for paragraph in progress(paragraphs, "load-text"):
        previous_glue = ''
        for sentence in list(paragraph.iter('s')):
            # create fake root word
            words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
            last_word_id = None

            if args.new_tei:
                # Here spacing is encoded by a join="right" attribute.
                for w in sentence.iter():
                    glue_after = '' if w.get('join') == 'right' else ' '
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = glue_after
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            words[w.get('id')].previous_glue = previous_glue
                            words[w.get('id')].glue = glue_after
                            previous_glue = glue_after
            else:
                # Here spacing is encoded by explicit <c> elements.
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = ''
                            last_word_id = None
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            last_word_id = w.get('id')
                            words[w.get('id')].previous_glue = previous_glue
                            previous_glue = ''
                    elif use_punctuations and w.tag == 'c':
                        # always save previous glue
                        previous_glue = w.text
                        if last_word_id:
                            words[last_word_id].glue += w.text

            for link in sentence.iter("link"):
                if 'dep' in link.keys():
                    ana = link.get('afun')
                    lfrom = link.get('from')
                    dest = link.get('dep')
                else:
                    ana = link.get('ana')
                    if ana is None or ana[:8] != 'jos-syn:':  # dont bother...
                        continue
                    ana = ana[8:]
                    lfrom, dest = link.get('target').replace('#', '').split()

                if lfrom not in words:
                    # strange errors, just skip...
                    continue

                if not skip_id_check and is_root_id(lfrom):
                    logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
                    # NOTE(review): the message reads as if processing should
                    # stop here, but no exit/raise is visible in the available
                    # source - confirm.

                if dest in words:
                    words[lfrom].add_link(ana, words[dest])
                else:
                    logging.error("Unknown id: {}".format(dest))
                    # NOTE(review): confirm whether this was meant to be fatal.

    return list(words.values())
def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    """Yield the linked words of every sentence that is known to ``w_collection``.

    All ``<s>`` ids of the document are looked up in ``w_collection`` in one
    query; sentences without a match are skipped. For the remaining ones each
    ``<w>`` becomes a ``Word`` carrying a running per-sentence index in
    ``idi``, punctuation elements become words without an index, the text of
    ``<c>`` elements is appended to the ``glue`` of the preceding word, and
    ``<link>`` elements are turned into links between the words.

    :param et: root element of the parsed document.
    :param skip_id_check: when false, link sources that look like root ids
        are reported as errors.
    :param do_msd_translate: forwarded to the ``Word`` constructors.
    :param pc_tag: tag name of punctuation elements.
    :param w_collection: object with ``find(query)``; queried with
        ``{'_id': {'$in': [...]}}`` (presumably a MongoDB collection - confirm).
    :return: generator of ``(sentence_id, words, attributes)`` where
        ``attributes`` is the found document without ``'_id'``.
    """
    sentences = list(et.iter('s'))
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        sent_id = sentence.attrib['id']
        if sent_id not in sentences_of_interest:
            continue

        words = {}
        idi = 0
        last_word_id = None
        for w in sentence.iter():
            if w.tag == 'w':
                last_word_id = w.get('id')
                words[last_word_id] = Word.from_xml(w, do_msd_translate)
                words[last_word_id].idi = str(idi)
                idi += 1
            elif w.tag == pc_tag:
                last_word_id = w.get('id')
                words[last_word_id] = Word.pc_word(w, do_msd_translate)
            elif w.tag == 'c':
                if last_word_id:
                    words[last_word_id].glue += w.text

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                if ana is None or ana[:8] != 'jos-syn:':  # dont bother...
                    continue
                ana = ana[8:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                # NOTE(review): no exit/raise is visible after this message
                # in the available source - confirm whether it was fatal.

            if dest in words:
                words[lfrom].add_link(ana, words[dest])
            else:
                logging.error("Unknown id: {}".format(dest))
                # NOTE(review): confirm whether this was meant to be fatal.

        yield sent_id, list(words.values()), sentences_of_interest[sent_id]
def file_sentence_glue_generator(files, pc_tag, w_collection):
    """Yield, per sentence of interest, its tokens together with their glue.

    Every file is parsed with ``load_xml``; the ids of its ``<s>`` elements
    are looked up in ``w_collection`` and sentences without a match are
    skipped. Tokens (``<w>`` and punctuation elements) are numbered from 1
    within the sentence; the text of a ``<c>`` element is appended to the
    glue of the token that precedes it.

    :param files: iterable of XML file paths.
    :param pc_tag: tag name of punctuation elements.
    :param w_collection: object with ``find(query)``; queried with
        ``{'_id': {'$in': [...]}}`` (presumably a MongoDB collection - confirm).
    :return: generator of ``(sentence_id, tokens)`` where each token is a
        list ``[text, position, glue]``.
    """
    for fname in files:
        et = load_xml(fname)

        sentences = list(et.iter('s'))
        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            if sentence.attrib['id'] not in sentences_of_interest:
                continue

            words = {}
            w_id = 1
            last_word_id = None
            sentence_id = None
            # iter() also visits the <s> element itself, which is where the
            # sentence id is picked up.
            for w in sentence.iter():
                if w.tag == 'w' or w.tag == pc_tag:
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == 'c':
                    if last_word_id:
                        words[last_word_id][2] += w.text
                elif w.tag == 's':
                    sentence_id = w.attrib['id']

            yield (sentence_id, list(words.values()))