Modified readme.md + Removed obligatory sloleks_db + Added frequency_limit and sorted parameters in recalculate_statistics.py
This commit is contained in:
parent
41952738ed
commit
1b0e6a27eb
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,4 +1,5 @@
|
|||
*.xml
|
||||
!collocation-structures.xml
|
||||
*.tbl
|
||||
*.csv
|
||||
*.pdf
|
||||
|
|
58
README.md
58
README.md
|
@ -10,7 +10,62 @@ Priporocam: pypy3 paket za hitrejse poganjanje.
|
|||
|
||||
Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv`
|
||||
|
||||
## Instructions for running on GF
|
||||
# About
|
||||
|
||||
This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on rules provided in the structure file (an example is in `collocation-structures.xml`).
|
||||
|
||||
# Setup
|
||||
|
||||
The script may be run via python3 or pypy3. We suggest using a virtual environment.
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
||||
# Running
|
||||
|
||||
```bash
|
||||
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
|
||||
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
|
||||
```
|
||||
|
||||
## Most important optional parameters
|
||||
|
||||
### --sloleks_db
|
||||
To use this, sqlalchemy has to be installed as well.
|
||||
PATH TO SLOLEKS DB
|
||||
|
||||
### --collocation_sentence_map_dest
|
||||
../data/collocation_sentence_mapper
|
||||
|
||||
### --db
|
||||
This is path to file which will contain sqlite database with internal states. Used to save internal states in case code gets modified.
|
||||
|
||||
We suggest to put this sqlite file in RAM for faster execution. To do this follow these instructions:
|
||||
|
||||
```bash
|
||||
sudo mkdir /mnt/tmp
|
||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||
```
|
||||
|
||||
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
||||
```bash
|
||||
sudo mkdir /mnt/tmp
|
||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
||||
```
|
||||
|
||||
Pass the path to a specific file when running `wani.py`. For example:
|
||||
```bash
|
||||
python3 wani.py ... --db /mnt/tmp/mysql-wani-ssj500k ...
|
||||
```
|
||||
|
||||
### --multiple-output
|
||||
Used when we want multiple output files (one file per structure_id).
|
||||
|
||||
|
||||
## Instructions for running on big files (ie. Gigafida)
|
||||
|
||||
We suggest running with the mysql file saved in tmpfs. Instructions:
|
||||
|
||||
|
@ -21,6 +76,7 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
|
|||
|
||||
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
||||
```bash
|
||||
sudo mkdir /mnt/tmp
|
||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
||||
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
||||
```
|
4765
collocation-structures.xml
Normal file
4765
collocation-structures.xml
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -175,11 +175,13 @@ def main(args):
|
|||
with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
|
||||
original_text, stats = get_new_stats(rf)
|
||||
freq_pos = original_text[0].index('Frequency')
|
||||
original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
|
||||
if len(original_text) > 1:
|
||||
original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
|
||||
else:
|
||||
original_text = [original_text[0]]
|
||||
if args.frequency_limit > 1:
|
||||
original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
|
||||
if args.sorted:
|
||||
if len(original_text) > 1:
|
||||
original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
|
||||
else:
|
||||
original_text = [original_text[0]]
|
||||
write_new_stats(wf, original_text, stats, file_name, word_order)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -190,6 +192,8 @@ if __name__ == '__main__':
|
|||
parser.add_argument('output',
|
||||
help='Path to folder that contains all input files.')
|
||||
parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
|
||||
parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
|
||||
parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
|
||||
|
||||
args = parser.parse_args()
|
||||
logging.basicConfig(stream=sys.stderr)
|
||||
|
|
|
@ -71,9 +71,7 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||
|
||||
# in case all agreements do not match try to get data from sloleks and change properly
|
||||
if not all(agreements_matched):
|
||||
if sloleks_db is None:
|
||||
raise Exception('sloleks_db not properly setup!')
|
||||
if sloleks_db is not None and not all(agreements_matched):
|
||||
for i, agr in enumerate(self.agreement):
|
||||
if not agr.match(word_msd):
|
||||
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
||||
|
@ -142,9 +140,7 @@ class WordFormMsdCR(WordFormAnyCR):
|
|||
super().add_word(word)
|
||||
|
||||
def _render(self, sloleks_db=None):
|
||||
if len(self.words) == 0:
|
||||
if sloleks_db is None:
|
||||
raise Exception('sloleks_db not properly setup!')
|
||||
if len(self.words) == 0 and sloleks_db is not None:
|
||||
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
||||
if msd is not None:
|
||||
self.words.append(WordDummy(msd, lemma, text))
|
||||
|
|
10
src/wani.py
10
src/wani.py
|
@ -80,9 +80,13 @@ def main(args):
|
|||
|
||||
# figure out representations!
|
||||
if args.out or args.out_no_stat:
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
||||
if args.sloleks_db is not None:
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
||||
else:
|
||||
sloleks_db = None
|
||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||
sloleks_db.close()
|
||||
if args.sloleks_db is not None:
|
||||
sloleks_db.close()
|
||||
|
||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
@ -102,7 +106,7 @@ if __name__ == '__main__':
|
|||
help='Structures definitions in xml file')
|
||||
parser.add_argument('input',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
|
||||
parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
|
||||
parser.add_argument('--out',
|
||||
help='Classic output file')
|
||||
parser.add_argument('--out-no-stat',
|
||||
|
|
Loading…
Reference in New Issue
Block a user