diff --git a/.gitignore b/.gitignore index e02e0ba..8b055ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.xml +!collocation-structures.xml *.tbl *.csv *.pdf diff --git a/README.md b/README.md index 86b56b3..762db3c 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,62 @@ Priporocam: pypy3 paket za hitrejse poganjanje. Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv` -## Instructions for running on GF +# About + +This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on the rules provided in the structure file (example in `collocation-structures.xml`). + +# Setup + +The script may be run with python3 or pypy3. We suggest using a virtual environment. + +```bash +pip install -r requirements.txt +``` + + +# Running + +```bash +python3 wani.py --out --sloleks_db +python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k +``` + +## Most important optional parameters + +### --sloleks_db +To use this, sqlalchemy has to be installed as well. +PATH TO SLOLEKS DB + +### --collocation_sentence_map_dest +../data/collocation_sentence_mapper + +### --db +This is the path to the file which will contain the sqlite database with internal states. It is used to save internal states in case the code gets modified. + +We suggest putting this sqlite file in RAM for faster execution. To do this, follow these instructions: + +```bash +sudo mkdir /mnt/tmp +sudo mount -t tmpfs tmpfs /mnt/tmp +``` + +If running on big corpora (e.g. Gigafida), keep the database in RAM: +```bash +sudo mkdir /mnt/tmp +sudo mount -t tmpfs tmpfs /mnt/tmp +sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp +``` + +Pass the path to the specific file when running `wani.py`. For example: +```bash +python3 wani.py ... 
--db /mnt/tmp/mysql-wani-ssj500k ... +``` + +### --multiple-output +Used when we want multiple output files (one file per structure_id). + + +## Instructions for running on big files (ie. Gigafida) Suggested running with saved mysql file in tmpfs. Instructions: @@ -21,6 +76,7 @@ sudo mount -t tmpfs tmpfs /mnt/tmp If running on big corpuses (ie. Gigafida have database in RAM): ```bash +sudo mkdir /mnt/tmp sudo mount -t tmpfs tmpfs /mnt/tmp sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp ``` \ No newline at end of file diff --git a/collocation-structures.xml b/collocation-structures.xml new file mode 100644 index 0000000..d4132f2 --- /dev/null +++ b/collocation-structures.xml @@ -0,0 +1,4765 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/recalculate_statistics.py b/scripts/recalculate_statistics.py index 12c434a..d565586 100644 --- a/scripts/recalculate_statistics.py +++ b/scripts/recalculate_statistics.py @@ -175,11 +175,13 @@ def main(args): with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf: original_text, stats = get_new_stats(rf) freq_pos = original_text[0].index('Frequency') - original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10] - if len(original_text) > 1: - original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos])) - else: - original_text = [original_text[0]] + if args.frequency_limit > 1: + original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10] + if args.sorted: + if len(original_text) > 1: + original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos])) + else: + original_text = [original_text[0]] write_new_stats(wf, original_text, stats, file_name, word_order) if __name__ == '__main__': @@ -190,6 +192,8 @@ if 
__name__ == '__main__': parser.add_argument('output', help='Path to folder that contains all input files.') parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.') + parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.') + parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.') args = parser.parse_args() logging.basicConfig(stream=sys.stderr) diff --git a/src/representation.py b/src/representation.py index f205d62..3d6c2a9 100644 --- a/src/representation.py +++ b/src/representation.py @@ -71,9 +71,7 @@ class WordFormAnyCR(ComponentRepresentation): agreements_matched = [agr.match(word_msd) for agr in self.agreement] # in case all agreements do not match try to get data from sloleks and change properly - if not all(agreements_matched): - if sloleks_db is None: - raise Exception('sloleks_db not properly setup!') + if sloleks_db is not None and not all(agreements_matched): for i, agr in enumerate(self.agreement): if not agr.match(word_msd): msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd) @@ -142,9 +140,7 @@ class WordFormMsdCR(WordFormAnyCR): super().add_word(word) def _render(self, sloleks_db=None): - if len(self.words) == 0: - if sloleks_db is None: - raise Exception('sloleks_db not properly setup!') + if len(self.words) == 0 and sloleks_db is not None: msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data) if msd is not None: self.words.append(WordDummy(msd, lemma, text)) diff --git a/src/wani.py b/src/wani.py index b7f5b15..df6d1fe 100644 --- a/src/wani.py +++ b/src/wani.py @@ -80,9 +80,13 @@ def main(args): # figure out representations! 
if args.out or args.out_no_stat: - sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks) + if args.sloleks_db is not None: + sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks) + else: + sloleks_db = None match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db) - sloleks_db.close() + if args.sloleks_db is not None: + sloleks_db.close() Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( structures, match_store) @@ -102,7 +106,7 @@ if __name__ == '__main__': help='Structures definitions in xml file') parser.add_argument('input', help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*') - parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials') + parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials') parser.add_argument('--out', help='Classic output file') parser.add_argument('--out-no-stat',