From aae67a307081410da52c9c851fc6d806aa8a7e0e Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski <cyp@cjvt.si>
Date: Wed, 13 Jan 2021 19:07:35 +0100
Subject: [PATCH] IssueID #1487: fix CLI flags, entry sorting and XPath calls

---
 scripts/merge_dictionaries.py | 5 +++--
 scripts/pipeline2.py          | 8 ++++----
 scripts/split_tei.py          | 4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/scripts/merge_dictionaries.py b/scripts/merge_dictionaries.py
index 085f69d..b6e9117 100644
--- a/scripts/merge_dictionaries.py
+++ b/scripts/merge_dictionaries.py
@@ -1,4 +1,5 @@
 import argparse
+import re
 import lxml.etree as lxml
 
 arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
@@ -14,11 +15,11 @@ def get_entries(input_file_name):
     return list(lxml.parse(input_file_name).getroot())
 
 entries = get_entries(single_file_name) + get_entries(multiple_file_name)
-sort(entries, key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('xml:id')).group(1)))
+entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
 
 root = lxml.Element('dictionary')
 for entry in entries:
-    del entry.attrib['xml:id']
+    del entry.attrib['sid']
     root.append(entry)
 tree = lxml.ElementTree(root)
 tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
diff --git a/scripts/pipeline2.py b/scripts/pipeline2.py
index 6c1f730..37e0292 100644
--- a/scripts/pipeline2.py
+++ b/scripts/pipeline2.py
@@ -43,7 +43,7 @@ def run_pipeline(input_tei_file_name, output_lexicon_file_name, output_structure
 
 def split_tei_input(input_file_name, single_file_name, multiple_file_name):
     print('Splitting TEI input file ...')
-    split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '--input', input_file_name, '--single', single_file_name, '--multiple', multiple_file_name])
+    split_command = ' '.join(['python', TEI_SPLIT_SCRIPT_NAME, '-infile', input_file_name, '-single', single_file_name, '-multiple', multiple_file_name])
     os.system(split_command)
 
 def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
@@ -70,12 +70,12 @@ def run_structure_creation(input_file_name, tei_file_name, output_file_name):
     
 def run_dictionary_conversion(tei_file_name, xml_file_name):
     print('Converting to dictionary xml format ...')
-    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name, '--keepids', 'true'])
+    convert_command = ' '.join(['python', TEI_DICTIONARY_SCRIPT_NAME, '-infile', tei_file_name, '-outfile', xml_file_name])
     os.system(convert_command)
 
-def merge_dictionaries(single_file_name, multiple_file_name, joint_file_name):
+def merge_dictionaries(single_file_name, multiple_file_name, output_file_name):
     print('Merging dictionary files ...')
-    merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '--multiple', multiple_file_name, '--joint', joint_file_name])
+    merge_command = ' '.join(['python', DICTIONARY_MERGE_SCRIPT_NAME, '-single', single_file_name, '-multiple', multiple_file_name, '-outfile', output_file_name])
     os.system(merge_command)
 
 def validate_dictionary(dictionary_file_name):
diff --git a/scripts/split_tei.py b/scripts/split_tei.py
index 8c3211d..d42599e 100644
--- a/scripts/split_tei.py
+++ b/scripts/split_tei.py
@@ -19,7 +19,7 @@ def count_tokens(paragraph):
 
 tree = lxml.parse(input_file_name)
 root = tree.getroot()
-paragraphs = xpath_find('.//tei:p')
+paragraphs = xpath_find(root, './/tei:p')
 for paragraph in paragraphs:
     if (count_tokens(paragraph) > 1):
         paragraph.getparent().remove(paragraph)
@@ -27,7 +27,7 @@ tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
 
 tree = lxml.parse(input_file_name)
 root = tree.getroot()
-paragraphs = xpath_find('.//tei:p')
+paragraphs = xpath_find(root, './/tei:p')
 for paragraph in paragraphs:
     if (count_tokens(paragraph) == 1):
         paragraph.getparent().remove(paragraph)