Redmine #1835: made input structure specification xml into parameter

parent 86e2b12782
commit 3c38cdbcae

README.md | 36
@@ -40,21 +40,20 @@ $ python scripts/process.py -mode strings_to_parse -infile /tmp/strings.txt -out
 The input should be a TEI XML file (in the same particular format as
 the output of strings_to_parse) and an xml file of structure
-specifications (the CJVT structures.xml file, supplemented with
-temporary new structures, if needed). It first splits the TEI file
-into two files, one with the single-component units and the other with
-the multiple-component units. For each, it then assigns each unit to a
-syntactic structure from the DDD database and converts the output into
-CJVT dictionary XML format. For the single-component units, this is
-pretty trivial, but for multiple-component units it is more involved,
-and includes two runs of the MWE extraction script
+specifications. It first splits the TEI file into two files, one with
+the single-component units and the other with the multiple-component
+units. For each, it then assigns each unit to a syntactic structure
+from the DDD database and converts the output into CJVT dictionary XML
+format. For the single-component units, this is pretty trivial, but
+for multiple-component units it is more involved, and includes two
+runs of the MWE extraction script
 [wani.py](https://gitea.cjvt.si/ozbolt/luscenje_struktur), generating
 missing structures in between. At the end, the single-component and
 multiple-component dictionary files are merged into one dictionary
 file. Example:

 ```
-$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
+$ python scripts/process.py -mode parse_to_dictionary -infile /tmp/parsed.xml -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
 ```

 ### strings_to_dictionary

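The temporary structures written to the output structures file carry a tempId attribute, which is also how the REST handlers further down count them. A minimal sketch for inspecting the output of the example above (the path is taken from that example, not fixed by this commit):

```python
# Count the temporary structures written by parse_to_dictionary.
# Sketch only; the path comes from the README example above.
from lxml import etree

structures_root = etree.parse('/tmp/structures_new.xml').getroot()
temporary = structures_root.xpath('.//syntactic_structure[@tempId]')
print('temporary new structures:', len(temporary))
```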
@@ -64,7 +63,7 @@ Combines strings_to_parse and parse_to_dictionary into one call
 between). Example:

 ```
-$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
+$ python scripts/process.py -mode strings_to_dictionary -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
 ```

 ### all
@@ -73,26 +72,23 @@ Same as strings_to_dictionary, but also validates the dictionary and
 structures outputs, just in case.

 ```
-$ python scripts/process.py -mode all -infile /tmp/strings.txt -outfile /tmp/dictionary.xml -structures /tmp/structures_new.xml
+$ python scripts/process.py -mode all -infile /tmp/strings.txt -instructs /tmp/structures_old.xml -outfile /tmp/dictionary.xml -outstructs /tmp/structures_new.xml
 ```

 ## REST API

 The package provides a REST API with endpoints roughly mirroring the
-process.py modes. For the calls accepting strings as input, GET calls
-are also supported for single-string input. For the calls resulting in
-dictionaries, the results include both the dictionary entries and the
-structures they use. If processing resulted in temporary new
-structures, their number is recorded in @new_structures.
+process.py modes. For most calls, POST is needed, so that input
+structures can be easily provided. If processing resulted in temporary
+new structures, their number is recorded in @new_structures.

 Example curl calls:

 ```
 $ curl -k https://proc1.cjvt.si/structures/strings_to_parse?string=velika%20miza
-$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse
-$ curl -k -X POST -F file=@/tmp/parse.xml https://proc1.cjvt.si/structures/parse_to_dictionary
-$ curl -k https://proc1.cjvt.si/structures/strings_to_dictionary?string=velika%20miza
-$ curl -k -X POST -F file=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_dictionary
+$ curl -k -X POST -F strings=@/tmp/strings.txt https://proc1.cjvt.si/structures/strings_to_parse
+$ curl -k -X POST -F parsed=@/tmp/parse.xml -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/parse_to_dictionary
+$ curl -k -X POST -F strings=@/tmp/strings.txt -F structures=@/tmp/structures.xml https://proc1.cjvt.si/structures/strings_to_dictionary
 ```

 ## Note
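For completeness, the same parse_to_dictionary call as the curl example, written as a small Python client. This is a sketch assuming the requests library; the endpoint URL, the form field names, and the new_structures attribute come from the diff above, while the file paths are placeholders:

```python
# Sketch of a Python client for the updated parse_to_dictionary endpoint.
# Field names ('parsed', 'structures') and the new_structures attribute
# come from the diff above; the paths and the requests library are assumptions.
import requests
from lxml import etree

url = 'https://proc1.cjvt.si/structures/parse_to_dictionary'
with open('/tmp/parse.xml', 'rb') as parsed, open('/tmp/structures.xml', 'rb') as structures:
    response = requests.post(url, files={'parsed': parsed, 'structures': structures},
                             verify=False)  # equivalent of curl -k

root = etree.fromstring(response.content)
print('temporary new structures:', root.get('new_structures'))
```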
@@ -29,7 +29,7 @@ def strings_to_parse():
             string_file.write(string + '\n')
         string_file.close()
     elif (request.method == 'POST'):
-        file_data = request.files['file']
+        file_data = request.files['strings']
         file_data.save(string_file_name)

     try:
@@ -49,19 +49,22 @@ def parse_to_dictionary():
     tmp_directory = tempfile.mkdtemp()
     parsed_file_name = tmp_directory + '/input_parsed.xml'
     dictionary_file_name = tmp_directory + '/output_dictionary.xml'
-    structure_file_name = tmp_directory + '/output_structures.xml'
+    input_structure_file_name = tmp_directory + '/input_structures.xml'
+    output_structure_file_name = tmp_directory + '/output_structures.xml'

     try:

-        file_data = request.files['file']
-        file_data.save(parsed_file_name)
+        parsed_file_data = request.files['parsed']
+        parsed_file_data.save(parsed_file_name)
+        structure_file_data = request.files['structures']
+        structure_file_data.save(input_structure_file_name)

-        runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, structure_file_name)
+        runner.parse_to_dictionary(parsed_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name)
         root = lxml.Element('response')

         dictionary_root = lxml.parse(dictionary_file_name).getroot()
         root.append(dictionary_root)
-        structure_root = lxml.parse(structure_file_name).getroot()
+        structure_root = lxml.parse(output_structure_file_name).getroot()
         new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]'))
         root.set('new_structures', str(new_structure_count))
         structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id'))
@@ -79,30 +82,28 @@ def parse_to_dictionary():
         return Response(message, mimetype='text/xml')


-@app.route(api_prefix + '/strings_to_dictionary', methods=['GET', 'POST'])
+@app.route(api_prefix + '/strings_to_dictionary', methods=['POST'])
 def strings_to_dictionary():

     tmp_directory = tempfile.mkdtemp()
     string_file_name = tmp_directory + '/input_string.txt'
     dictionary_file_name = tmp_directory + '/output_dictionary.xml'
-    structure_file_name = tmp_directory + '/output_structures.xml'
+    input_structure_file_name = tmp_directory + '/input_structures.xml'
+    output_structure_file_name = tmp_directory + '/output_structures.xml'

     try:

-        if (request.method == 'GET'):
-            string = request.args.get('string')
-            with open(string_file_name, 'w') as string_file:
-                string_file.write(string + '\n')
-        elif (request.method == 'POST'):
-            file_data = request.files['file']
-            file_data.save(string_file_name)
+        string_file_data = request.files['strings']
+        string_file_data.save(string_file_name)
+        structure_file_data = request.files['structures']
+        structure_file_data.save(input_structure_file_name)

-        runner.strings_to_dictionary(string_file_name, dictionary_file_name, structure_file_name)
+        runner.strings_to_dictionary(string_file_name, dictionary_file_name, input_structure_file_name, output_structure_file_name)
         root = lxml.Element('response')

         dictionary_root = lxml.parse(dictionary_file_name).getroot()
         root.append(dictionary_root)
-        structure_root = lxml.parse(structure_file_name).getroot()
+        structure_root = lxml.parse(output_structure_file_name).getroot()
         new_structure_count = len(structure_root.xpath('.//syntactic_structure[@tempId]'))
         root.set('new_structures', str(new_structure_count))
         structure_ids = set(dictionary_root.xpath('.//lexicalUnit/@structure_id'))
@@ -13,7 +13,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
             'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
             'mwes-1': 'mwes1.csv',
             'mwes-2': 'mwes2.csv',
-            'structures-old': 'structures.xml',
+            'structures-old': 'structures_old.xml',
             'structures-new': 'structures_new.xml',
             'dictionary-single': 'dictionary_single.xml',
             'dictionary-multiple': 'dictionary_multiple.xml',
@@ -25,24 +25,26 @@ class Runner:
             NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
             self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)

-    def run_all(self, input_file_name, output_file_name, structure_file_name):
+    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline(self.resource_directory, self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
+        pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
         self._parse_to_dictionary_sequence(pipeline)
         pipeline.do_validate_structures()
-        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.export_file(output_structure_file_name, 'structures-new')
         pipeline.do_validate_dictionary()
         pipeline.export_file(output_file_name, 'dictionary')
         pipeline.cleanup()

-    def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
+    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline(self.resource_directory, self.nlp)
         pipeline.import_file(input_file_name, 'strings-list')
+        pipeline.import_file(input_structure_file_name, 'structures-old')
         self._strings_to_parse_sequence(pipeline)
         self._parse_to_dictionary_sequence(pipeline)
         pipeline.export_file(output_file_name, 'dictionary')
-        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.export_file(output_structure_file_name, 'structures-new')
         pipeline.cleanup()

     def strings_to_parse(self, input_file_name, output_file_name):
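With the new signatures, calling the Runner directly looks roughly like this. A sketch only: the import path and file paths are placeholders, and the constructor call mirrors the one in process.py below.

```python
# Sketch of the updated Runner API; the module path and file paths are
# placeholders, and the constructor call mirrors process.py below.
from runner import Runner  # placeholder import path

runner = Runner('./resources', True)  # (resource_directory, nlp_needed), as in process.py
runner.strings_to_dictionary(
    '/tmp/strings.txt',         # input_file_name
    '/tmp/dictionary.xml',      # output_file_name
    '/tmp/structures_old.xml',  # input_structure_file_name (new parameter)
    '/tmp/structures_new.xml')  # output_structure_file_name (new parameter)
```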
@@ -52,12 +54,13 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         pipeline.cleanup()

-    def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
+    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline(self.resource_directory)
         pipeline.import_file(input_file_name, 'tei-initial')
+        pipeline.import_file(input_structure_file_name, 'structures-old')
         self._parse_to_dictionary_sequence(pipeline)
         pipeline.export_file(output_file_name, 'dictionary')
-        pipeline.export_file(structure_file_name, 'structures-new')
+        pipeline.export_file(output_structure_file_name, 'structures-new')
         pipeline.cleanup()

     def validate_structures(self, input_file_name):
@@ -10,25 +10,27 @@ if (__name__ == '__main__'):
     arg_parser.add_argument('-mode', type=str, help='Mode')
     arg_parser.add_argument('-infile', type=str, help='Input file')
     arg_parser.add_argument('-outfile', type=str, help='Output file')
-    arg_parser.add_argument('-structures', type=str, help='Updated structure file')
+    arg_parser.add_argument('-instructs', type=str, help='Input structure file')
+    arg_parser.add_argument('-outstructs', type=str, help='Output structure file')
     arguments = arg_parser.parse_args()

     mode = arguments.mode
     input_file_name = arguments.infile
     output_file_name = arguments.outfile
-    structure_file_name = arguments.structures
+    input_structure_file_name = arguments.instructs
+    output_structure_file_name = arguments.outstructs

     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
     runner = Runner(resource_directory, nlp_needed)
     if (mode == 'strings_to_parse'):
         runner.strings_to_parse(input_file_name, output_file_name)
     elif (mode == 'strings_to_dictionary'):
-        runner.strings_to_dictionary(input_file_name, output_file_name, structure_file_name)
+        runner.strings_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
     elif (mode == 'parse_to_dictionary'):
-        runner.parse_to_dictionary(input_file_name, output_file_name, structure_file_name)
+        runner.parse_to_dictionary(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
     elif (mode == 'validate_structures'):
         runner.validate_structures(input_file_name)
     elif (mode == 'validate_dictionary'):
         runner.validate_dictionary(input_file_name)
     elif (mode == 'all'):
-        runner.run_all(input_file_name, output_file_name, structure_file_name)
+        runner.run_all(input_file_name, output_file_name, input_structure_file_name, output_structure_file_name)
@@ -11,7 +11,6 @@ git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
 git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
 git clone git@gitea.cjvt.si:generic/data_admin.git
 git clone git@gitea.cjvt.si:generic/xml_schemas.git
-scp -P 3022 proc1.cjvt.si:/net/nas/resources/miscellaneous/structures/structures.xml .
 cd ..

 ## prepare python environment
@@ -35,5 +34,4 @@ ln -s ../lib/nova_slovnica/resources/dict.xml .
 ln -s ../lib/data_admin/resources/structures.xsd .
 ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
 ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
-mv ../lib/structures.xml .
 cd ..