diff --git a/.gitignore b/.gitignore
index dddcc6c..a44f149 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .idea/
 data/
+venv/
\ No newline at end of file
diff --git a/solar2svala.py b/solar2svala.py
index d2a74f5..97b28b7 100644
--- a/solar2svala.py
+++ b/solar2svala.py
@@ -180,6 +180,8 @@ def process_file(et, args):
         output_folder_loc = os.path.join(args.output_folder, file_name)
         error_folder_loc = os.path.join(args.error_folder, file_name)
 
+        essay_problematic = False
+
         paragraphs = div.findall('p')
         for paragraph in paragraphs:
             sentences = paragraph.findall('s')
@@ -206,14 +208,22 @@ def process_file(et, args):
             if not paragraph_error:
                 if not os.path.exists(output_folder_loc):
                     os.mkdir(output_folder_loc)
+                if not os.path.exists(error_folder_loc):
+                    os.mkdir(error_folder_loc)
                 with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
                     json.dump(dictionary, wf, ensure_ascii=False, indent="")
+                with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
+                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
             else:
+                essay_problematic = True
                 if not os.path.exists(error_folder_loc):
                     os.mkdir(error_folder_loc)
-                with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
+                with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '_problem.json'), 'w') as wf:
                     json.dump(dictionary, wf, ensure_ascii=False, indent="")
 
+        if not essay_problematic:
+            shutil.rmtree(error_folder_loc)
+
 
 def main(args):
     with open(args.input_file, 'r') as fp:
diff --git a/txt2svala.py b/txt2svala.py
new file mode 100644
index 0000000..2ffb640
--- /dev/null
+++ b/txt2svala.py
@@ -0,0 +1,67 @@
+import argparse
+import json
+import logging
+import os
+import shutil
+import time
+
+import obeliks
+
+logging.basicConfig(level=logging.INFO)
+
+
+def add_token(ind, text, source, target, edges):
+    # Create matching source/target tokens and a one-to-one edge between them.
+    source_id = "s" + ind
+    source.append({"id": source_id, "text": text + " "})
+    target_id = "t" + ind
+    target.append({"id": target_id, "text": text + " "})
+    edge_id = "e-" + source_id + "-" + target_id
+    edges[edge_id] = {"id": edge_id, "ids": [source_id, target_id], "labels": [], "manual": False}
+
+
+def paragraph_to_svala(paragraph):
+    # Build an identity alignment: every source token maps to itself in the target.
+    i = 1
+    source = []
+    target = []
+    edges = {}
+    for word in paragraph:
+        add_token(str(i), word, source, target, edges)
+        i += 1
+
+    return {"source": source, "target": target, "edges": edges}
+
+
+def process_file(file, args):
+    file_path = os.path.join(args.input_folder, file)
+    with open(file_path, 'r') as fp:
+        for i, line in enumerate(fp):
+            # obeliks.run returns one tab-separated token per line; keep the token text (second column).
+            tokenized = [token.split('\t')[1] for token in obeliks.run(line).split('\n') if len(token.split('\t')) > 1]
+            dictionary = paragraph_to_svala(tokenized)
+            with open(os.path.join(args.output_folder, file + str(i + 1) + '.json'), 'w') as wf:
+                json.dump(dictionary, wf, ensure_ascii=False, indent="")
+
+
+def main(args):
+    # Recreate the output folder once here, not in process_file, where a per-file
+    # rmtree would erase the results of every previously processed file.
+    if os.path.exists(args.output_folder):
+        shutil.rmtree(args.output_folder)
+    os.mkdir(args.output_folder)
+    for file in os.listdir(args.input_folder):
+        process_file(file, args)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Convert plain-text files (one paragraph per line) into Svala-style JSON, one file per paragraph.')
+    parser.add_argument('--input_folder', default='data/txt/input',
+                        help='folder containing the input .txt files')
+    parser.add_argument('--output_folder', default='data/txt/output',
+                        help='folder where the generated Svala JSON files are written')
+    args = parser.parse_args()
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))
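
Note, not part of the patch: a minimal sketch of the structure txt2svala.py emits, assuming the repo root is on the import path; the two-token paragraph below is a made-up example. It reproduces the JSON that process_file() writes via paragraph_to_svala():

    from txt2svala import paragraph_to_svala

    # Identity alignment for a two-token paragraph: s1 -> t1, s2 -> t2.
    svala = paragraph_to_svala(["Pozdravljen", "svet"])
    assert svala["source"][0] == {"id": "s1", "text": "Pozdravljen "}
    assert svala["target"][1] == {"id": "t2", "text": "svet "}
    assert svala["edges"]["e-s1-t1"] == {
        "id": "e-s1-t1", "ids": ["s1", "t1"], "labels": [], "manual": False}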