Redmine #1835: made input structure specification xml into parameter
This commit is contained in:
parent
86e2b12782
commit
3c38cdbcae
36
README.md
36
README.md
|
@ -40,21 +40,20 @@ $ python scripts/process.py -mode strings_to_parse -infile /tmp/strings.txt -out
|
|||
|
||||
The input should be a TEI XML file (in the same particular format as
|
||||
the output of strings_to_parse) and an xml file of structure
|
||||
specifications (the CJVT structures.xml file, supplemented with
|
||||
temporary new structures, if needed). It first splits the TEI file
|
||||
into two files, one with the single-component units and the other with
|
||||
the multiple-component units. For each, it then assigns each unit to a
|
||||
syntactic structure from the DDD database and converts the output into
|
||||
CJVT dictionary XML format. For the single-component units, this is
|
||||
pretty trivial, but for multiple-component units it is more involved,
|
||||
and includes two runs of the MWE extraction script
|
||||
specifications. It first splits the TEI file into two files, one with
|
||||
the single-component units and the other with the multiple-component
|
||||
units. For each, it then assigns each unit to a syntactic structure
|
||||
from the DDD database and converts the output into CJVT dictionary XML
|
||||
format. For the single-component units, this is pretty trivial, but
|
||||
for multiple-component units it is more involved, and includes two
|
||||
runs of the MWE extraction script
|
||||
[wani.py](https://gitea.cjvt.si/ozbolt/luscenje_struktur), generating
|
||||
missing structures in between. At the end, the single-component and
|
||||
multiple-component dictionary files are merged into one dictionary
|
||||
file. Example:
|
||||
|
||||
```
|
||||
$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
|
||||
$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
|
||||
```
|
||||
|
||||
### strings_to_dictionary
|
||||
|
@ -64,7 +63,7 @@ Combines strings_to_parse and parse_to_dictionary into one call
|
|||
between). Example:
|
||||
|
||||
```
|
||||
$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
|
||||
$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
|
||||
```
|
||||
|
||||
### all
|
||||
|
@ -73,26 +72,23 @@ Same as strings_to_dictionary, but also validates the dictionary and
|
|||
structures outputs, just in case.
|
||||
|
||||
```
|
||||
$ python scripts/process.py -mode all -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
|
||||
$ python scripts/process.py -mode all -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
|
||||
```
|
||||
|
||||
## REST API
|
||||
|
||||
The package provides a REST API with endpoints roughly mirroring the
|
||||
process.py modes. For the calls accepting strings as input, GET calls
|
||||
are also supported for single-string input. For the calls resulting in
|
||||
dictionaries, the results include both the dictionary entries and the
|
||||
structures they use. If processing resulted in temporary new
|
||||
structures, their number is recorded in @new_structures.
|
||||
process.py modes. For most calls, POST is needed, so that input
|
||||
structures can be easily provided. If processing resulted in temporary
|
||||
new structures, their number is recorded in @new_structures.
|
||||
|
||||
Example curl calls:
|
||||
|
||||
```
|
||||
$ curl -k https://proc1.cjvt.si/structures/strings_to_parse?string=velika%20miza
|
||||
$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse
|
||||
$ curl -k -X POST -F file=@/tmp/parse.xml https://proc1.cjvt.si/structures/parse_to_dictionary
|
||||
$ curl -k https://proc1.cjvt.si/structures/strings_to_dictionary?string=velika%20miza
|
||||
$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_dictionary
|
||||
$ curl -k -X POST -F strings=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse
|
||||
$ curl -k -X POST -F parsed=@/tmp/parse.xml -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/parse_to_dictionary
|
||||
$ curl -k -X POST -F strings=@/tmp/strings.txt -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/strings_to_dictionary
|
||||
```
|
||||
|
||||
## Note
|
||||
|
|
|
@ -29,7 +29,7 @@ def strings_to_parse():
|
|||
string_file.write(string + '\n')
|
||||
string_file.close()
|
||||
elif (request.method == 'POST'):
|
||||
file_data = request.files['file']
|
||||
file_data = request.files['strings']
|
||||
file_data.save(string_file_name)
|
||||
|
||||
try:
|
||||
|
@ -49,19 +49,22 @@ def parse_to_dictionary():
|
|||
tmp_directory = tempfile.mkdtemp()
|
||||
parsed_file_name = tmp_directory + '/input_parsed.xml'
|
||||
dictionary_file_name = tmp_directory + '/output_dictionary.xml'
|
||||
structure_file_name = tmp_directory + '/output_structures.xml'
|
||||
input_structure_file_name = tmp_directory + '/input_structures.xml'
|
||||
output_structure_file_name = tmp_directory + '/output_structures.xml'
|
||||
|
||||
try:
|
||||
|
||||
file_data = request.files['file']
|
||||
file_data.save(parsed_file_name)
|
||||
parsed_file_data = request.files['parsed']
|
||||
parsed_file_data.save(parsed_file_name)
|
||||
structure_file_data = request.files['structures']
|
||||
structure_file_data.save(input_structure_file_name)
|
||||
|
||||
runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, structure_file_name)
|
||||
runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name)
|
||||
root = lxml.Element('response')
|
||||
|
||||
dictionary_root = lxml.parse(dictionary_file_name).getroot()
|
||||
root.append(dictionary_root)
|
||||
structure_root = lxml.parse(structure_file_name).getroot()
|
||||
structure_root = lxml.parse(output_structure_file_name).getroot()
|
||||
new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]'))
|
||||
root.set('new_structures', str(new_structure_count))
|
||||
structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id'))
|
||||
|
@ -79,30 +82,28 @@ def parse_to_dictionary():
|
|||
return Response(message, mimetype='text/xml')
|
||||
|
||||
|
||||
@app.route(api_prefix + '/strings_to_dictionary', methods=['GET', 'POST'])
|
||||
@app.route(api_prefix + '/strings_to_dictionary', methods=['POST'])
|
||||
def strings_to_dictionary():
|
||||
|
||||
tmp_directory = tempfile.mkdtemp()
|
||||
string_file_name = tmp_directory + '/input_string.txt'
|
||||
dictionary_file_name = tmp_directory + '/output_dictionary.xml'
|
||||
structure_file_name = tmp_directory + '/output_structures.xml'
|
||||
input_structure_file_name = tmp_directory + '/input_structures.xml'
|
||||
output_structure_file_name = tmp_directory + '/output_structures.xml'
|
||||
|
||||
try:
|
||||
|
||||
if (request.method == 'GET'):
|
||||
string = request.args.get('string')
|
||||
with open(string_file_name, 'w') as string_file:
|
||||
string_file.write(string + '\n')
|
||||
elif (request.method == 'POST'):
|
||||
file_data = request.files['file']
|
||||
file_data.save(string_file_name)
|
||||
string_file_data = request.files['strings']
|
||||
string_file_data.save(string_file_name)
|
||||
structure_file_data = request.files['structures']
|
||||
structure_file_data.save(input_structure_file_name)
|
||||
|
||||
runner.strings_to_dictionary(string_file_name, dictionary_file_name, structure_file_name)
|
||||
runner.strings_to_dictionary(string_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name)
|
||||
root = lxml.Element('response')
|
||||
|
||||
dictionary_root = lxml.parse(dictionary_file_name).getroot()
|
||||
root.append(dictionary_root)
|
||||
structure_root = lxml.parse(structure_file_name).getroot()
|
||||
structure_root = lxml.parse(output_structure_file_name).getroot()
|
||||
new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]'))
|
||||
root.set('new_structures', str(new_structure_count))
|
||||
structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id'))
|
||||
|
|
|
@ -13,7 +13,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
|
|||
'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
|
||||
'mwes-1': 'mwes1.csv',
|
||||
'mwes-2': 'mwes2.csv',
|
||||
'structures-old': 'structures.xml',
|
||||
'structures-old': 'structures_old.xml',
|
||||
'structures-new': 'structures_new.xml',
|
||||
'dictionary-single': 'dictionary_single.xml',
|
||||
'dictionary-multiple': 'dictionary_multiple.xml',
|
||||
|
|
|
@ -25,24 +25,26 @@ class Runner:
|
|||
NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
|
||||
self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
|
||||
|
||||
def run_all(self, input_file_name, output_file_name, structure_file_name):
|
||||
def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
||||
pipeline.import_file(input_file_name, 'strings-list')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._strings_to_parse_sequence(pipeline)
|
||||
self._parse_to_dictionary_sequence(pipeline)
|
||||
pipeline.do_validate_structures()
|
||||
pipeline.export_file(structure_file_name, 'structures-new')
|
||||
pipeline.export_file(output_structure_file_name, 'structures-new')
|
||||
pipeline.do_validate_dictionary()
|
||||
pipeline.export_file(output_file_name, 'dictionary')
|
||||
pipeline.cleanup()
|
||||
|
||||
def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
|
||||
def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory, self.nlp)
|
||||
pipeline.import_file(input_file_name, 'strings-list')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._strings_to_parse_sequence(pipeline)
|
||||
self._parse_to_dictionary_sequence(pipeline)
|
||||
pipeline.export_file(output_file_name, 'dictionary')
|
||||
pipeline.export_file(structure_file_name, 'structures-new')
|
||||
pipeline.export_file(output_structure_file_name, 'structures-new')
|
||||
pipeline.cleanup()
|
||||
|
||||
def strings_to_parse(self, input_file_name, output_file_name):
|
||||
|
@ -52,12 +54,13 @@ class Runner:
|
|||
pipeline.export_file(output_file_name, 'tei-initial')
|
||||
pipeline.cleanup()
|
||||
|
||||
def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
|
||||
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||
pipeline = Pipeline(self.resource_directory)
|
||||
pipeline.import_file(input_file_name, 'tei-initial')
|
||||
pipeline.import_file(input_structure_file_name, 'structures-old')
|
||||
self._parse_to_dictionary_sequence(pipeline)
|
||||
pipeline.export_file(output_file_name, 'dictionary')
|
||||
pipeline.export_file(structure_file_name, 'structures-new')
|
||||
pipeline.export_file(output_structure_file_name, 'structures-new')
|
||||
pipeline.cleanup()
|
||||
|
||||
def validate_structures(self, input_file_name):
|
||||
|
|
|
@ -10,25 +10,27 @@ if (__name__ == '__main__'):
|
|||
arg_parser.add_argument('-mode', type=str, help='Mode')
|
||||
arg_parser.add_argument('-infile', type=str, help='Input file')
|
||||
arg_parser.add_argument('-outfile', type=str, help='Output file')
|
||||
arg_parser.add_argument('-structures', type=str, help='Updated structure file')
|
||||
arg_parser.add_argument('-instructs', type=str, help='Input structure file')
|
||||
arg_parser.add_argument('-outstructs', type=str, help='Output structure file')
|
||||
arguments = arg_parser.parse_args()
|
||||
|
||||
mode = arguments.mode
|
||||
input_file_name = arguments.infile
|
||||
output_file_name = arguments.outfile
|
||||
structure_file_name = arguments.structures
|
||||
input_structure_file_name = arguments.instructs
|
||||
output_structure_file_name = arguments.outstructs
|
||||
|
||||
nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
|
||||
runner = Runner(resource_directory, nlp_needed)
|
||||
if (mode == 'strings_to_parse'):
|
||||
runner.strings_to_parse(input_file_name, output_file_name)
|
||||
elif (mode == 'strings_to_dictionary'):
|
||||
runner.strings_to_dictionary(input_file_name, output_file_name, structure_file_name)
|
||||
runner.strings_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
|
||||
elif (mode == 'parse_to_dictionary'):
|
||||
runner.parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
|
||||
runner.parse_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
|
||||
elif (mode == 'validate_structures'):
|
||||
runner.validate_structures(input_file_name)
|
||||
elif (mode == 'validate_dictionary'):
|
||||
runner.validate_dictionary(input_file_name)
|
||||
elif (mode == 'all'):
|
||||
runner.run_all(input_file_name, output_file_name, structure_file_name)
|
||||
runner.run_all(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
|
||||
|
|
|
@ -11,7 +11,6 @@ git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
|
|||
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
|
||||
git clone git@gitea.cjvt.si:generic/data_admin.git
|
||||
git clone git@gitea.cjvt.si:generic/xml_schemas.git
|
||||
scp -P 3022 proc1.cjvt.si:/net/nas/resources/miscellaneous/structures/structures.xml .
|
||||
cd ..
|
||||
|
||||
## prepare python environment
|
||||
|
@ -35,5 +34,4 @@ ln -s ../lib/nova_slovnica/resources/dict.xml .
|
|||
ln -s ../lib/data_admin/resources/structures.xsd .
|
||||
ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
|
||||
ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
|
||||
mv ../lib/structures.xml .
|
||||
cd ..
|
||||
|
|
Loading…
Reference in New Issue
Block a user