diff --git a/tools/main.py b/tools/main.py index 5fb8fc8..eeb21e6 100644 --- a/tools/main.py +++ b/tools/main.py @@ -17,7 +17,11 @@ if __name__ == "__main__": # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" kres_dir = "../data/kres_example/" for kres_file in os.listdir(kres_dir): + out_file = "" res_dict = parser.parse_tei(join(kres_dir, kres_file)) for _, sentence in res_dict.items(): - parser.to_conll09(sentence) + out_file += parser.to_conll_2009_full(sentence) + with open(join(kres_dir, kres_file + ".tsv"), "wb+") as fp: + fp.write(out_file.encode("utf-8")) + fp.close() print("end parsing kres") diff --git a/tools/parser/__pycache__/__init__.cpython-37.pyc b/tools/parser/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..8b49a1e Binary files /dev/null and b/tools/parser/__pycache__/__init__.cpython-37.pyc differ diff --git a/tools/parser/__pycache__/msdmap.cpython-37.pyc b/tools/parser/__pycache__/msdmap.cpython-37.pyc new file mode 100644 index 0000000..82d62dc Binary files /dev/null and b/tools/parser/__pycache__/msdmap.cpython-37.pyc differ diff --git a/tools/parser/__pycache__/parser.cpython-37.pyc b/tools/parser/__pycache__/parser.cpython-37.pyc new file mode 100644 index 0000000..732b752 Binary files /dev/null and b/tools/parser/__pycache__/parser.cpython-37.pyc differ diff --git a/tools/parser/parser.py b/tools/parser/parser.py index dacc8c5..2409089 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -90,6 +90,7 @@ def parse_tei(filepath): parse_links(s) if guess_corpus == "KRES" else None ) } + fp.close() return res_dict @@ -108,7 +109,7 @@ def parse_links(s_el): return res_links -def to_conll09(sentence_entry): +def to_conll_2009_SRL(sentence_entry): def fillpred(pos, feat): if False: @@ -154,3 +155,16 @@ def to_conll09(sentence_entry): out_str += "\n" print(out_str) return out_str + + +def to_conll_2009_full(sentence_entry): + out_str = "" + for token in sentence_entry["tokens"]: + t_id = token[1] + # 1 3 + out_str += "{}\t{}\n".format( + t_id, # id + token[2], # form + ) + out_str += "\n" + return out_str \ No newline at end of file