Modified readme.md + Removed obligatory sloleks_db + Added frequency_limit and sorted parameters in recalculate_statistics.py

2020-09-02 10:53:45 +02:00
parent 41952738ed
commit 1b0e6a27eb
6 changed files with 4841 additions and 15 deletions
@@ -1,4 +1,5 @@
 *.xml
+!collocation-structures.xml
 *.tbl
 *.csv
 *.pdf
@@ -10,7 +10,62 @@ Priporocam: pypy3 paket za hitrejse poganjanje.

 Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml  izhod.csv`

-## Instructions for running on GF
+# About
+
+This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on rules provided in a structure file (see `collocation-structures.xml` for an example).
+
+# Setup
+
+The script may be run with python3 or pypy3. We suggest using a virtual environment.
+
+```bash
+pip install -r requirements.txt
+```
+
+
+# Running
+
+```bash
+python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
+python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
+```
+
+## Most important optional parameters
+
+### --sloleks_db
+To use this option, sqlalchemy has to be installed as well.
+PATH TO SLOLEKS DB
+
+### --collocation_sentence_map_dest
+../data/collocation_sentence_mapper 
+
+### --db
+This is the path to the file that will contain the sqlite database with internal states. It is used to save internal states in case the code gets modified.
+
+We suggest putting this sqlite file in RAM for faster execution. To do this, follow these instructions:
+
+```bash
+sudo mkdir /mnt/tmp
+sudo mount -t tmpfs tmpfs /mnt/tmp
+```
+
+If running on big corpora (e.g. Gigafida), keep the database in RAM:
+```bash
+sudo mkdir /mnt/tmp
+sudo mount -t tmpfs tmpfs /mnt/tmp
+sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
+```
+
+Pass the path to a specific file when running `wani.py`. For example:
+```bash
+python3 wani.py ... --db /mnt/tmp/mysql-wani-ssj500k ...
+```
+
+### --multiple-output
+Used when we want multiple output files (one file per structure_id).
+
+
+## Instructions for running on big files (e.g. Gigafida)

 Suggested running with saved mysql file in tmpfs. Instructions:

@@ -21,6 +76,7 @@ sudo mount -t tmpfs tmpfs /mnt/tmp

 If running on big corpora (e.g. Gigafida), keep the database in RAM:
 ```bash
+sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
 ```
@@ -175,7 +175,9 @@ def main(args):
        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
            original_text, stats = get_new_stats(rf)
            freq_pos = original_text[0].index('Frequency')
+            if args.frequency_limit > 1:
                original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
+            if args.sorted:
                if len(original_text) > 1:
                    original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
                else:
@@ -190,6 +192,8 @@ if __name__ == '__main__':
    parser.add_argument('output',
                        help='Path to folder that contains all input files.')
    parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
+    parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
+    parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr)
@@ -71,9 +71,7 @@ class WordFormAnyCR(ComponentRepresentation):
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]

            # in case all agreements do not match try to get data from sloleks and change properly
-            if not all(agreements_matched):
-                if sloleks_db is None:
-                    raise Exception('sloleks_db not properly setup!')
+            if sloleks_db is not None and not all(agreements_matched):
                for i, agr in enumerate(self.agreement):
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
@@ -142,9 +140,7 @@ class WordFormMsdCR(WordFormAnyCR):
            super().add_word(word)

    def _render(self, sloleks_db=None):
-        if len(self.words) == 0:
-            if sloleks_db is None:
-                raise Exception('sloleks_db not properly setup!')
+        if len(self.words) == 0 and sloleks_db is not None:
            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
            if msd is not None:
                self.words.append(WordDummy(msd, lemma, text))
@@ -80,8 +80,12 @@ def main(args):

    # figure out representations!
    if args.out or args.out_no_stat:
+        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
+        else:
+            sloleks_db = None
        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
+        if args.sloleks_db is not None:
            sloleks_db.close()

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
@@ -102,7 +106,7 @@ if __name__ == '__main__':
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
-    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
+    parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',