diff --git a/src/formatter.py b/src/formatter.py index c526229..f1c384d 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -63,6 +63,9 @@ class OutNoStatFormatter(Formatter): def group(self): return True + + def __str__(self): + return "out-no-stat" class AllFormatter(Formatter): def header_repeat(self): @@ -81,6 +84,9 @@ class AllFormatter(Formatter): def group(self): return False + def __str__(self): + return "all" + class StatsFormatter(Formatter): def additional_init(self): self.stats = None @@ -160,6 +166,9 @@ class StatsFormatter(Formatter): def group(self): return True + + def __str__(self): + return "stat" class OutFormatter(Formatter): def additional_init(self): @@ -187,4 +196,7 @@ class OutFormatter(Formatter): self.f2.set_structure(structure) def new_match(self, match): - self.f2.new_match(match) \ No newline at end of file + self.f2.new_match(match) + + def __str__(self): + return "out" \ No newline at end of file diff --git a/src/loader.py b/src/loader.py index 267ff8d..ff1e662 100644 --- a/src/loader.py +++ b/src/loader.py @@ -3,8 +3,7 @@ import logging import re import sys -from tqdm import tqdm - +from progress_bar import progress from word import Word @@ -23,7 +22,6 @@ def load_files(args): def load_xml(filename): - logging.info("LOADING XML: {}".format(filename)) with open(filename, 'r') as fp: content = fp.read() @@ -35,7 +33,7 @@ def load_xml(filename): def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag): words = {} sentences = list(et.iter('s')) - for sentence in tqdm(sentences, desc="load-text"): + for sentence in progress(sentences, "load-text", infile=True): for w in sentence.iter("w"): words[w.get('id')] = Word(w, do_msd_translate) for pc in sentence.iter(pc_tag): diff --git a/src/match_store.py b/src/match_store.py index 35c5e6a..fdacdd8 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -2,11 +2,7 @@ from collections import defaultdict from match import StructureMatch from representation_assigner import 
import sys

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional; without it Progress prints a plain text counter.
    tqdm = None


class Progress:
    """Wrap an iterable so that iterating it reports progress.

    An instance is called like ``tqdm``: ``progress(iterable, description)``
    returns a generator yielding the items of ``iterable`` unchanged.

    Each call site may tag its loop as ``infile`` or ``outfile``.  The
    instance flag ``self.infile`` (set by :meth:`init`) selects which tagged
    loops are reported:

    * ``infile=True`` loops are reported only while ``self.infile`` is true;
    * ``outfile=True`` loops are reported only while ``self.infile`` is false;
    * untagged loops are always reported.
    """

    def __init__(self):
        # Until init() is called, infile-tagged loops stay silent.
        self.infile = False

    def __call__(self, iterable, description, infile=False, outfile=False):
        """Yield every item of ``iterable``, reporting progress if enabled.

        Args:
            iterable: Any iterable; generators are consumed lazily.
            description: Label shown in front of the progress output.
            infile: Tag this loop as an inner loop (see class docstring).
            outfile: Tag this loop as an outer loop (see class docstring).

        Yields:
            The items of ``iterable``, in order and unmodified.
        """
        hidden = (infile and not self.infile) or (outfile and self.infile)
        if hidden:
            yield from iterable
        elif tqdm is not None:
            yield from tqdm(iterable, desc=description)
        else:
            yield from self._plain(iterable, description)

    @staticmethod
    def _plain(iterable, description):
        """Text-only fallback used when tqdm is not installed."""
        try:
            total = len(iterable)
        except TypeError:
            # Generators and other unsized iterables: stream them and show a
            # running count instead of materialising everything up front.
            total = None

        def report(done):
            if total:
                pct = min(done * 100 // total, 100)
                text = "\r{}: {:02d}% ({}/{})".format(
                    description, pct, done, total)
            else:
                text = "\r{}: {}".format(description, done)
            # stderr matches tqdm's default stream and keeps stdout clean;
            # flush because the line is not newline-terminated.
            print(text, end="", file=sys.stderr, flush=True)

        done = 0
        last_step = -1
        for el in iterable:
            # Sized input: redraw only when the whole percentage changes.
            # Unsized input: redraw for every item.
            step = done * 100 // total if total else done
            if step > last_step:
                report(done)
                last_step = step
            yield el
            done += 1

        if done:
            # Final state, so the line ends on N/N (100%) rather than N-1.
            report(done)
        print("", file=sys.stderr)

    def init(self, args):
        """Configure the instance from parsed command-line arguments.

        Args:
            args: Object with a boolean ``hide_inner_progress`` attribute;
                when it is true, infile-tagged loops are silenced and
                outfile-tagged loops are reported instead.
        """
        self.infile = not args.hide_inner_progress


progress = Progress()
= {s: [] for s in structures} - for s in structures: + for s in progress(structures, "matching", infile=True): for w in words: mhere = s.match(w) for match in mhere: @@ -77,7 +72,7 @@ def main(args): word_stats.add_words(words) else: - for words in tqdm(load_files(args)): + for words in progress(load_files(args), "files", outfile=True): matches = match_file(words, structures) # just save to temporary file, used for children of a parallel process # MUST NOT have more than one file @@ -151,9 +146,11 @@ if __name__ == '__main__': parser.add_argument('--match-to-file', help='Do not use!') parser.add_argument('--pickled-structures', help='Do not use!', action='store_true') + parser.add_argument('--hide-inner-progress', help='Do not use!', action='store_true') args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) + progress.init(args) start = time.time() main(args) diff --git a/src/word_stats.py b/src/word_stats.py index fea4072..d0191e3 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -1,5 +1,8 @@ from collections import defaultdict, Counter +from progress_bar import progress + + class WordStats: def __init__(self, lemma_features): self.raw_data = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) @@ -22,7 +25,7 @@ class WordStats: def generate_renders(self): num_words = defaultdict(int) - for lemma, ld in self.raw_data.items(): + for lemma, ld in progress(self.raw_data.items(), "lemma-render"): self.rendered_words[lemma] = {} freq_words = defaultdict(int) common_msd = "*" * 10 diff --git a/src/writer.py b/src/writer.py index a4d5ec1..13451da 100644 --- a/src/writer.py +++ b/src/writer.py @@ -1,4 +1,5 @@ import logging +from progress_bar import progress from formatter import OutFormatter, OutNoStatFormatter, AllFormatter, StatsFormatter @@ -122,7 +123,7 @@ class Writer: fp = fp_open() self.write_header(fp) - for s in structures: + for s in progress(structures, "writing:{}".format(self.formatter)): if 
self.multiple_output: fp = fp_open(s.id) self.write_header(fp)