diff --git a/src/loader.py b/src/loader.py
index cd8bd83..1d6f86e 100644
--- a/src/loader.py
+++ b/src/loader.py
@@ -14,8 +14,8 @@ def is_root_id(id_):
     return len(id_.split('.')) == 3


-def load_files(args, database):
-    filenames = args.input
+def load_files(args, database, w_collection=None, input_corpus=None):
+    filenames = input_corpus if input_corpus is not None else args.input
     skip_id_check = args.skip_id_check
     do_msd_translate = not args.no_msd_translate

@@ -29,22 +29,35 @@ def load_files(args, database):
         extension = pathlib.Path(fname).suffix

         # check if file with the same name already loaded...
-        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
+        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
         if loaded is not None:
             print("ALREADY LOADED")
             continue

         if extension == ".xml":
             et = load_xml(fname)
-            yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+            if input_corpus is None:
+                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+            else:
+                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
+                for sent_id, sentence, othr_attributes in sentence_generator:
+                    yield sent_id, sentence, othr_attributes
         elif extension == ".gz":
-            yield load_csv(fname, True)
+            if input_corpus is None:
+                yield load_csv(fname, True)
+            else:
+                sentences = load_csv_valency(fname, True, w_collection)
+                for sentence in sentences:
+                    yield sentence
         else:
-            yield load_csv(fname, False)
-        # else:
-        #     raise NotImplementedError("Unknown file extension: {}".format(extension))
+            if input_corpus is None:
+                yield load_csv(fname, False)
+            else:
+                sentences = load_csv_valency(fname, False, w_collection)
+                for sentence in sentences:
+                    yield sentence

-        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
+        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
         database.commit()


@@ -95,14 +108,70 @@ def load_csv(filename, compressed):
         except ValueError:
             bad_sentence = True
         full_id = "{}.{}".format(sid, wid)
-
+
         words[wid] = Word(lemma, msd, full_id, text, True)
         if link_src != '0':
             links.append((link_src, wid, link_type))
-
+
         sentence_end(bad_sentence)
     return result

+
+def load_csv_valency(filename, compressed, w_collection):
+    # TODO: skip sentences that are not among the sentences of interest
+    result = {}
+    bad_sentence = False
+
+    words = {}
+    links = []
+    idi = 0
+
+    def sentence_end(bad_sentence, sid):
+        if bad_sentence:
+            return
+
+        for lfrom, ldest, ana in links:
+            if lfrom not in words or ldest not in words:
+                logging.warning("Bad link in sentence: " + sid)
+                continue
+            words[lfrom].add_link(ana, words[ldest])
+        result[sid] = list(words.values())
+
+    line_gen = lines_gz if compressed else lines_csv
+    for line in line_gen(filename):
+        line_str = line.strip()
+        line_fixed = line_str.replace('\t\t\t', '\t,\t')
+        line_split = line_fixed.split("\t")
+
+        if line_split[1] == "1" and len(words) > 0:
+            sentence_end(bad_sentence, sid)
+            bad_sentence = False
+            links = []
+            words = {}
+            idi = 0
+
+        try:
+            sid, wid, text, msd, lemma, link_src, link_type = line_split
+        except ValueError:
+            bad_sentence = True
+        full_id = "{}.{}".format(sid, wid)
+
+        words[wid] = Word(lemma, msd, full_id, text, True)
+        if re.match(r'^\w', text) is not None:  # running index only for word-like tokens, not punctuation
+            words[wid].idi = str(idi)
+            idi += 1
+
+        if link_src != '0':
+            links.append((link_src, wid, link_type))
+
+    sentence_end(bad_sentence, sid)
+
+    sentence_ids = list(result.keys())
+    cur = w_collection.find({'_id': {'$in': sentence_ids}})
+    cur = list(cur)
+    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'}) for c in cur]
+    return sorted(unsorted_result, key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))
+

 def load_xml(filename):
     with open(filename, 'r') as fp:
         content = fp.read()
@@ -150,3 +219,96 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
                 pass

     return list(words.values())
+
+
+
+def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
+    words = {}
+    sentences = list(et.iter('s'))
+    sentence_ids = [s.attrib['id'] for s in sentences]
+    cur = w_collection.find({'_id': {'$in': sentence_ids}})
+    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}
+
+    for sentence in progress(sentences, "load-text"):
+        if sentence.attrib['id'] not in sentences_of_interest:
+            continue
+        idi = 0
+        last_word_id = None
+        for w in sentence.iter():
+            if w.tag == 'w':
+                last_word_id = w.get('id')
+                words[last_word_id] = Word.from_xml(w, do_msd_translate)
+                words[last_word_id].idi = str(idi)
+                idi += 1
+            elif w.tag == pc_tag:
+                last_word_id = w.get('id')
+                words[last_word_id] = Word.pc_word(w, do_msd_translate)
+            elif w.tag == 'c':
+                if last_word_id:
+                    words[last_word_id].glue += w.text
+
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
+            else:
+                ana = l.get('ana')
+                if ana[:8] != 'jos-syn:':  # don't bother...
+                    continue
+                ana = ana[8:]
+                lfrom, dest = l.get('target').replace('#', '').split()
+
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
+
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
+        words = {}
+
+
+def file_sentence_glue_generator(files, pc_tag, w_collection):
+    for fname in files:
+        et = load_xml(fname)
+
+        words = {}
+        sentences = list(et.iter('s'))
+
+        sentence_ids = [s.attrib['id'] for s in sentences]
+        cur = w_collection.find({'_id': {'$in': sentence_ids}})
+        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}
+
+        for sentence in progress(sentences, "load-text"):
+            if sentence.attrib['id'] not in sentences_of_interest:
+                continue
+            w_id = 1
+            last_word_id = None
+            sentence_id = None
+            for w in sentence.iter():
+                if w.tag == 'w':
+                    last_word_id = w_id
+                    words[last_word_id] = [w.text, last_word_id, '']
+                    w_id += 1
+                elif w.tag == pc_tag:
+                    last_word_id = w_id
+                    words[last_word_id] = [w.text, last_word_id, '']
+                    w_id += 1
+                elif w.tag == 'c':
+                    if last_word_id:
+                        words[last_word_id][2] += w.text
+                elif w.tag == 's':
+                    sentence_id = w.attrib['id']
+
+            yield (sentence_id, list(words.values()))
+            words = {}
diff --git a/src/word.py b/src/word.py
index f30522c..53f4036 100644
--- a/src/word.py
+++ b/src/word.py
@@ -4,6 +4,13 @@ import logging
 from msd_translate import MSD_TRANSLATE


+class WordCompressed:
+    def __init__(self, text, collocation, dependency_tree):
+        self.text = text
+        self.collocation = collocation
+        self.dependency_tree = dependency_tree
+
+
 class WordMsdOnly:
     def __init__(self, msd):
         self.msd = msd
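Note (not part of the patch): below is a minimal usage sketch for the new valency
path added to load_files. Everything in it is assumed glue, not code from this
repository: DB_PATH, MONGO_URI, CORPUS_FILES, the Mongo database/collection names
and the Files table schema are hypothetical stand-ins, and args simply mirrors the
fields load_files already reads (input, skip_id_check, no_msd_translate, pc_tag).
It also assumes `re` (alongside the existing `logging`) is imported at the top of
loader.py, which the patch itself does not show.

    # Hedged sketch: drive load_files() in valency mode.
    # All constants below are hypothetical placeholders.
    import argparse
    import sqlite3

    from pymongo import MongoClient

    import loader  # src/loader.py as modified by this patch

    DB_PATH = "files_cache.db"             # hypothetical sqlite file
    MONGO_URI = "mongodb://localhost:27017"
    CORPUS_FILES = ["corpus/part1.xml"]    # hypothetical corpus shard

    args = argparse.Namespace(
        input=[], skip_id_check=False, no_msd_translate=False, pc_tag="pc"
    )

    database = sqlite3.connect(DB_PATH)
    # Minimal stand-in for the real schema; load_files only reads/writes `filename`.
    database.execute("CREATE TABLE IF NOT EXISTS Files (filename TEXT)")

    # Collection whose _id values are sentence ids (e.g. "ssj.1.1"); each document's
    # remaining keys are passed through as the third element of every yielded triple.
    w_collection = MongoClient(MONGO_URI)["valency"]["sentences"]

    # With input_corpus set, load_files yields (sentence_id, words, attrs) per
    # matched sentence instead of one word list per file.
    for sent_id, words, attrs in loader.load_files(
            args, database, w_collection=w_collection, input_corpus=CORPUS_FILES):
        print(sent_id, len(words), sorted(attrs))

For the CSV/.gz path, load_csv_valency sorts its output by the three components of
the sentence id (corpus id, then the two numeric parts); the XML path yields the
matched sentences in document order.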