Modified readme.md + Removed obligatory sloleks_db + Added frequency_limit and sorted parameters in recalculate_statistics.py
This commit is contained in:
parent
41952738ed
commit
1b0e6a27eb
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,4 +1,5 @@
|
||||||
*.xml
|
*.xml
|
||||||
|
!collocation-structures.xml
|
||||||
*.tbl
|
*.tbl
|
||||||
*.csv
|
*.csv
|
||||||
*.pdf
|
*.pdf
|
||||||
|
|
58
README.md
58
README.md
|
@ -10,7 +10,62 @@ Priporocam: pypy3 paket za hitrejse poganjanje.
|
||||||
|
|
||||||
Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv`
|
Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv`
|
||||||
|
|
||||||
## Instructions for running on GF
|
# About
|
||||||
|
|
||||||
|
This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on rules provided in a structure file (see the example in `collocation-structures.xml`).
|
||||||
|
|
||||||
|
# Setup
|
||||||
|
|
||||||
|
The script may be run with python3 or pypy3. We suggest using a virtual environment.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
# Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
|
||||||
|
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
|
||||||
|
```
|
||||||
|
|
||||||
|
## Most important optional parameters
|
||||||
|
|
||||||
|
### --sloleks_db
|
||||||
|
To use this option, sqlalchemy has to be installed as well.
|
||||||
|
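PATH TO SLOLEKS DB: the Sloleks database credentials string, for example `luka:akul:superdb_small:127.0.0.1` as used in the command above.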
|
||||||
|
|
||||||
|
### --collocation_sentence_map_dest
|
||||||
|
../data/collocation_sentence_mapper
|
||||||
|
|
||||||
|
### --db
|
||||||
|
This is the path to the file that will contain the sqlite database with internal states. It is used to save internal states in case the code gets modified.
|
||||||
|
|
||||||
|
We suggest putting this sqlite file in RAM for faster execution. To do this, follow these instructions:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo mkdir /mnt/tmp
|
||||||
|
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||||
|
```
|
||||||
|
|
||||||
|
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
||||||
|
```bash
|
||||||
|
sudo mkdir /mnt/tmp
|
||||||
|
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||||
|
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
||||||
|
```
|
||||||
|
|
||||||
|
Pass the path to a specific file when running `wani.py`. For example:
|
||||||
|
```bash
|
||||||
|
python3 wani.py ... --db /mnt/tmp/mysql-wani-ssj500k ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### --multiple-output
|
||||||
|
Used when we want multiple output files (one file per structure_id).
|
||||||
|
|
||||||
|
|
||||||
|
## Instructions for running on big files (e.g. Gigafida)
|
||||||
|
|
||||||
Suggested running with saved mysql file in tmpfs. Instructions:
|
Suggested running with saved mysql file in tmpfs. Instructions:
|
||||||
|
|
||||||
|
@ -21,6 +76,7 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||||
|
|
||||||
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
||||||
```bash
|
```bash
|
||||||
|
sudo mkdir /mnt/tmp
|
||||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||||
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
||||||
```
|
```
|
4765
collocation-structures.xml
Normal file
4765
collocation-structures.xml
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -175,11 +175,13 @@ def main(args):
|
||||||
with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
|
with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
|
||||||
original_text, stats = get_new_stats(rf)
|
original_text, stats = get_new_stats(rf)
|
||||||
freq_pos = original_text[0].index('Frequency')
|
freq_pos = original_text[0].index('Frequency')
|
||||||
original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
|
if args.frequency_limit > 1:
|
||||||
if len(original_text) > 1:
|
original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
|
||||||
original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
|
if args.sorted:
|
||||||
else:
|
if len(original_text) > 1:
|
||||||
original_text = [original_text[0]]
|
original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
|
||||||
|
else:
|
||||||
|
original_text = [original_text[0]]
|
||||||
write_new_stats(wf, original_text, stats, file_name, word_order)
|
write_new_stats(wf, original_text, stats, file_name, word_order)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -190,6 +192,8 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('output',
|
parser.add_argument('output',
|
||||||
help='Path to folder that contains all input files.')
|
help='Path to folder that contains all input files.')
|
||||||
parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
|
parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
|
||||||
|
parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
|
||||||
|
parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
logging.basicConfig(stream=sys.stderr)
|
logging.basicConfig(stream=sys.stderr)
|
||||||
|
|
|
@ -71,9 +71,7 @@ class WordFormAnyCR(ComponentRepresentation):
|
||||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||||
|
|
||||||
# in case all agreements do not match try to get data from sloleks and change properly
|
# in case all agreements do not match try to get data from sloleks and change properly
|
||||||
if not all(agreements_matched):
|
if sloleks_db is not None and not all(agreements_matched):
|
||||||
if sloleks_db is None:
|
|
||||||
raise Exception('sloleks_db not properly setup!')
|
|
||||||
for i, agr in enumerate(self.agreement):
|
for i, agr in enumerate(self.agreement):
|
||||||
if not agr.match(word_msd):
|
if not agr.match(word_msd):
|
||||||
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
||||||
|
@ -142,9 +140,7 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
super().add_word(word)
|
super().add_word(word)
|
||||||
|
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self, sloleks_db=None):
|
||||||
if len(self.words) == 0:
|
if len(self.words) == 0 and sloleks_db is not None:
|
||||||
if sloleks_db is None:
|
|
||||||
raise Exception('sloleks_db not properly setup!')
|
|
||||||
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
||||||
if msd is not None:
|
if msd is not None:
|
||||||
self.words.append(WordDummy(msd, lemma, text))
|
self.words.append(WordDummy(msd, lemma, text))
|
||||||
|
|
10
src/wani.py
10
src/wani.py
|
@ -80,9 +80,13 @@ def main(args):
|
||||||
|
|
||||||
# figure out representations!
|
# figure out representations!
|
||||||
if args.out or args.out_no_stat:
|
if args.out or args.out_no_stat:
|
||||||
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
if args.sloleks_db is not None:
|
||||||
|
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
||||||
|
else:
|
||||||
|
sloleks_db = None
|
||||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||||
sloleks_db.close()
|
if args.sloleks_db is not None:
|
||||||
|
sloleks_db.close()
|
||||||
|
|
||||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||||
structures, match_store)
|
structures, match_store)
|
||||||
|
@ -102,7 +106,7 @@ if __name__ == '__main__':
|
||||||
help='Structures definitions in xml file')
|
help='Structures definitions in xml file')
|
||||||
parser.add_argument('input',
|
parser.add_argument('input',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||||
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
|
parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
|
||||||
parser.add_argument('--out',
|
parser.add_argument('--out',
|
||||||
help='Classic output file')
|
help='Classic output file')
|
||||||
parser.add_argument('--out-no-stat',
|
parser.add_argument('--out-no-stat',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user