Modified readme.md + Removed obligatory sloleks_db + Added frequency_limit and sorted parameters in recalculate_statistics.py

2020-09-02 10:53:45 +02:00
parent 41952738ed
commit 1b0e6a27eb
6 changed files with 4841 additions and 15 deletions
@@ -1,4 +1,5 @@
 *.xml
 !collocation-structures.xml
 *.tbl
 *.csv
 *.pdf
@@ -10,7 +10,62 @@ Priporocam: pypy3 paket za hitrejse poganjanje.
 Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml  izhod.csv`
-## Instructions for running on GF
+# About
 This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on the rules provided in a structure file (see the example in `collocation-structures.xml`).
 # Setup
 The script may be run with python3 or pypy3. We suggest using a virtual environment.
 ```bash
 pip install -r requirements.txt
 ```
 # Running
 ```bash
 python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
 python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
 ```
 ## Most important optional parameters
 ### --sloleks_db
 To use this option, sqlalchemy has to be installed as well.
 PATH TO SLOLEKS DB
 ### --collocation_sentence_map_dest
 ../data/collocation_sentence_mapper 
 ### --db
 This is the path to the file that will contain the sqlite database with internal states. It is used to preserve the internal states in case the code gets modified.
 We suggest putting this sqlite file in RAM for faster execution. To do this, follow these instructions:
 ```bash
 sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
 ```
 If running on big corpora (e.g. Gigafida), keep the database in RAM:
 ```bash
 sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
 ```
 Pass the path to the specific file when running `wani.py`. For example:
 ```bash
 python3 wani.py ... --db /mnt/tmp/mysql-wani-ssj500k ...
 ```
 ### --multiple-output
 Used when we want multiple output files (one file per structure_id).
 ## Instructions for running on big files (e.g. Gigafida)
 We suggest running with the saved mysql file in tmpfs. Instructions:
@@ -21,6 +76,7 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
 If running on big corpora (e.g. Gigafida), keep the database in RAM:
 ```bash
 sudo mkdir /mnt/tmp
 sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
 ```
@@ -175,7 +175,9 @@ def main(args):
        with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
            original_text, stats = get_new_stats(rf)
            freq_pos = original_text[0].index('Frequency')
            if args.frequency_limit > 1:
                original_text = [original_text[0]] + [l for l in original_text[1:] if int(l[freq_pos]) >= 10]
            if args.sorted:
                if len(original_text) > 1:
                    original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
                else:
@@ -190,6 +192,8 @@ if __name__ == '__main__':
    parser.add_argument('output',
                        help='Path to folder that contains all input files.')
    parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
    parser.add_argument('--frequency_limit', type=int, default=1, help='File that contains word order for DeltaP calculations.')
    parser.add_argument('--sorted', action='store_true', help='File that contains word order for DeltaP calculations.')
    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr)
@@ -71,9 +71,7 @@ class WordFormAnyCR(ComponentRepresentation):
            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
            # in case all agreements do not match try to get data from sloleks and change properly
-            if not all(agreements_matched):
+            if sloleks_db is not None and not all(agreements_matched):
                if sloleks_db is None:
                    raise Exception('sloleks_db not properly setup!')
                for i, agr in enumerate(self.agreement):
                    if not agr.match(word_msd):
                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
@@ -142,9 +140,7 @@ class WordFormMsdCR(WordFormAnyCR):
            super().add_word(word)
    def _render(self, sloleks_db=None):
-        if len(self.words) == 0:
+        if len(self.words) == 0 and sloleks_db is not None:
            if sloleks_db is None:
                raise Exception('sloleks_db not properly setup!')
            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
            if msd is not None:
                self.words.append(WordDummy(msd, lemma, text))
@@ -80,8 +80,12 @@ def main(args):
    # figure out representations!
    if args.out or args.out_no_stat:
        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        else:
            sloleks_db = None
        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        if args.sloleks_db is not None:
            sloleks_db.close()
    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
@@ -102,7 +106,7 @@ if __name__ == '__main__':
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
-    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
+    parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',