Added txt2svala formatting script
parent 530b6efe48, commit 4b88ada956
.gitignore (vendored) | 1

@@ -1,2 +1,3 @@
.idea/
data/
venv/
@@ -180,6 +180,8 @@ def process_file(et, args):
    output_folder_loc = os.path.join(args.output_folder, file_name)
    error_folder_loc = os.path.join(args.error_folder, file_name)

    essay_problematic = False

    paragraphs = div.findall('p')
    for paragraph in paragraphs:
        sentences = paragraph.findall('s')
@@ -206,13 +208,21 @@ def process_file(et, args):
            if not paragraph_error:
                if not os.path.exists(output_folder_loc):
                    os.mkdir(output_folder_loc)
                with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
            else:
                if not os.path.exists(error_folder_loc):
                    os.mkdir(error_folder_loc)
                with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
                with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
        else:
            essay_problematic = True
            if not os.path.exists(error_folder_loc):
                os.mkdir(error_folder_loc)
            with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json'), 'w') as wf:
                json.dump(dictionary, wf, ensure_ascii=False, indent="")

    if not essay_problematic:
        shutil.rmtree(error_folder_loc)


def main(args):
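For orientation, a rough sketch of the per-essay directory layout these two hunks produce, as far as the hunk itself shows. The angle-bracket placeholders stand for args.output_folder, args.error_folder, the essay's file_name and the paragraph xml:id; they are illustrative, not literal values from the commit.

# <output_folder>/<file_name>/<paragraph id>.json          paragraph converted without errors
# <error_folder>/<file_name>/<paragraph id>.json           extra copy of a paragraph that hit an error
# <error_folder>/<file_name>/<paragraph id>_problem.json   paragraph that could not be converted at all
# If no paragraph in the essay turned out problematic, <error_folder>/<file_name>/ is removed again at the end.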
txt2svala.py (new file) | 59

@@ -0,0 +1,59 @@
import argparse
import json
import logging
import os
import shutil
import time
import obeliks

logging.basicConfig(level=logging.INFO)


def add_token(ind, text, source, target, edges):
    # append one source/target token pair and the 1:1 edge that links them
    source_id = "s" + ind
    source.append({"id": source_id, "text": text + " "})
    target_id = "t" + ind
    target.append({"id": target_id, "text": text + " "})
    edge_id = "e-" + source_id + "-" + target_id
    edges[edge_id] = {"id": edge_id, "ids": [source_id, target_id], "labels": [], "manual": False}


def paragraph_to_svala(paragraph):
    # turn a list of tokens into the svala document structure
    i = 1
    source = []
    target = []
    edges = {}
    for word in paragraph:
        add_token(str(i), word, source, target, edges)
        i += 1

    return {"source": source, "target": target, "edges": edges}


def process_file(file, args):
    file_path = os.path.join(args.input_folder, file)
    if os.path.exists(args.output_folder):
        shutil.rmtree(args.output_folder)
    os.mkdir(args.output_folder)
    with open(file_path, 'r') as fp:
        for i, line in enumerate(fp):
            # tokenize the line with obeliks; the token text is taken from the second tab-separated column
            tokenized = [token.split('\t')[1] for token in obeliks.run(line).split('\n') if len(token.split('\t')) > 1]
            dictionary = paragraph_to_svala(tokenized)
            with open(os.path.join(args.output_folder, file + str(i+1) + '.json'), 'w') as wf:
                json.dump(dictionary, wf, ensure_ascii=False, indent="")


def main(args):
    for file in os.listdir(args.input_folder):
        process_file(file, args)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Tokenize plain text files with obeliks and write them out as svala-formatted JSON, one file per paragraph.')
    parser.add_argument('--input_folder', default='data/txt/input',
                        help='folder with input text files (one paragraph per line)')
    parser.add_argument('--output_folder', default='data/txt/output',
                        help='folder where the svala JSON files are written')
    args = parser.parse_args()

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
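For reference, a minimal sketch of the structure paragraph_to_svala returns for a two-token paragraph; the words are made-up examples, only the id and edge scheme comes from the code above.

paragraph_to_svala(["Pes", "laja"])
# evaluates to:
{
    "source": [{"id": "s1", "text": "Pes "}, {"id": "s2", "text": "laja "}],
    "target": [{"id": "t1", "text": "Pes "}, {"id": "t2", "text": "laja "}],
    "edges": {
        "e-s1-t1": {"id": "e-s1-t1", "ids": ["s1", "t1"], "labels": [], "manual": False},
        "e-s2-t2": {"id": "e-s2-t2", "ids": ["s2", "t2"], "labels": [], "manual": False}
    }
}

With the defaults above the script is run as python txt2svala.py --input_folder data/txt/input --output_folder data/txt/output; every line of each input file is tokenized with obeliks and written as <file name><line number>.json. As written, process_file recreates the output folder on every call, so if the input folder contains more than one file only the last file's output remains.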