Added txt2svala formatting script

master
Luka 3 years ago
parent 530b6efe48
commit 4b88ada956

1
.gitignore vendored

@ -1,2 +1,3 @@
.idea/
data/
venv/

@ -180,6 +180,8 @@ def process_file(et, args):
output_folder_loc = os.path.join(args.output_folder, file_name)
error_folder_loc = os.path.join(args.error_folder, file_name)
essay_problematic = False
paragraphs = div.findall('p')
for paragraph in paragraphs:
sentences = paragraph.findall('s')
@ -206,14 +208,22 @@ def process_file(et, args):
if not paragraph_error:
if not os.path.exists(output_folder_loc):
os.mkdir(output_folder_loc)
if not os.path.exists(error_folder_loc):
os.mkdir(error_folder_loc)
with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
else:
essay_problematic = True
if not os.path.exists(error_folder_loc):
os.mkdir(error_folder_loc)
with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json'), 'w') as wf:
json.dump(dictionary, wf, ensure_ascii=False, indent="")
if not essay_problematic:
shutil.rmtree(error_folder_loc)
def main(args):
with open(args.input_file, 'r') as fp:

@ -0,0 +1,59 @@
import argparse
import json
import logging
import os
import shutil
import time
import obeliks
logging.basicConfig(level=logging.INFO)
def add_token(ind, text, source, target, edges):
    """Append one token to both the source and target sides of a svala
    document and record the identity edge that links the two copies.

    ind    -- string index used to build the token ids ("s<ind>"/"t<ind>")
    text   -- token surface form; a trailing space is appended for svala
    source -- list of source-side token dicts (mutated in place)
    target -- list of target-side token dicts (mutated in place)
    edges  -- dict of edge-id -> edge record (mutated in place)
    """
    sid = f"s{ind}"
    tid = f"t{ind}"
    source.append({"id": sid, "text": text + " "})
    target.append({"id": tid, "text": text + " "})
    key = f"e-{sid}-{tid}"
    edges[key] = {"id": key, "ids": [sid, tid], "labels": [], "manual": False}
def paragraph_to_svala(paragraph):
    """Build a svala-format dict from a tokenized paragraph.

    paragraph -- iterable of token strings.
    Returns {"source": [...], "target": [...], "edges": {...}} where source
    and target hold identical token lists and edges maps each pair 1:1.
    """
    source = []
    target = []
    edges = {}
    # Idiom fix: enumerate(start=1) replaces the hand-rolled counter the
    # original incremented manually; token ids still start at 1.
    for i, word in enumerate(paragraph, start=1):
        add_token(str(i), word, source, target, edges)
    return {"source": source, "target": target, "edges": edges}
def process_file(file, args):
    """Tokenize one plain-text file with obeliks and write one svala JSON
    document per input line into args.output_folder.

    file -- file name inside args.input_folder (also used as the output
            file-name prefix).
    args -- parsed CLI namespace providing input_folder and output_folder.
    """
    file_path = os.path.join(args.input_folder, file)
    # BUG FIX: the original did shutil.rmtree(args.output_folder) here, so
    # every call wiped the JSON produced for previously processed files
    # when main() loops over a folder. Only create the folder if missing.
    if not os.path.exists(args.output_folder):
        os.mkdir(args.output_folder)
    with open(file_path, 'r') as fp:
        # Output files are numbered from 1 to match the original's i+1.
        for line_no, line in enumerate(fp, start=1):
            # obeliks emits one token per line in TSV form; column 1 holds
            # the surface form. Lines without a tab are metadata — skip.
            tokenized = [token.split('\t')[1] for token in obeliks.run(line).split('\n') if len(token.split('\t')) > 1]
            dictionary = paragraph_to_svala(tokenized)
            with open(os.path.join(args.output_folder, file + str(line_no) + '.json'), 'w') as wf:
                json.dump(dictionary, wf, ensure_ascii=False, indent="")
def main(args):
    """Process every file found in args.input_folder."""
    for entry in os.listdir(args.input_folder):
        process_file(entry, args)
if __name__ == '__main__':
    # BUG FIX: description and help texts were copy-pasted from an unrelated
    # gigafida/XML tool; rewritten to describe what this script actually does.
    parser = argparse.ArgumentParser(
        description='Tokenize plain-text files with obeliks and write them out as svala-format JSON, one document per input line.')
    parser.add_argument('--input_folder', default='data/txt/input',
                        help='Folder containing the input .txt files (one paragraph per line).')
    parser.add_argument('--output_folder', default='data/txt/output',
                        help='Folder where the generated svala JSON files are written.')
    args = parser.parse_args()
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
Loading…
Cancel
Save