Add min-frequency option: --group now accepts an optional integer minimum frequency
This commit is contained in:
parent
93d7af3aea
commit
de6c73980e
19
wani.py
19
wani.py
|
@ -856,7 +856,7 @@ class Writer:
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
if len(rows[0]) <= self.sort_by:
|
if len(rows[0]) <= self.sort_by:
|
||||||
print("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])), file=sys.stderr)
|
logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])))
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -888,14 +888,15 @@ class Writer:
|
||||||
to_write.append("") # not yet implemented...
|
to_write.append("") # not yet implemented...
|
||||||
|
|
||||||
if self.group:
|
if self.group:
|
||||||
if colocation_ids.is_written(cid):
|
if colocation_ids.should_write(cid):
|
||||||
continue
|
|
||||||
else:
|
|
||||||
to_write.append(colocation_ids.num(cid))
|
to_write.append(colocation_ids.num(cid))
|
||||||
colocation_ids.set_written(cid)
|
colocation_ids.set_written(cid)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
rows.append(to_write)
|
rows.append(to_write)
|
||||||
|
|
||||||
|
if len(rows) > 0:
|
||||||
rows = self.sorted_rows(rows)
|
rows = self.sorted_rows(rows)
|
||||||
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
|
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
|
||||||
file_handler.flush()
|
file_handler.flush()
|
||||||
|
@ -923,7 +924,6 @@ class Writer:
|
||||||
self.write_header(fp)
|
self.write_header(fp)
|
||||||
|
|
||||||
sid_matches = matches[s.id]
|
sid_matches = matches[s.id]
|
||||||
if len(sid_matches) > 0:
|
|
||||||
self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids)
|
self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids)
|
||||||
|
|
||||||
if self.multiple_output:
|
if self.multiple_output:
|
||||||
|
@ -936,6 +936,7 @@ class Writer:
|
||||||
class ColocationIds:
|
class ColocationIds:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.data = {}
|
self.data = {}
|
||||||
|
self.min_frequency = args.group
|
||||||
|
|
||||||
def add_match(self, key):
|
def add_match(self, key):
|
||||||
if key in self.data:
|
if key in self.data:
|
||||||
|
@ -946,8 +947,8 @@ class ColocationIds:
|
||||||
def get(self, key, n):
|
def get(self, key, n):
|
||||||
return self.data[key][n]
|
return self.data[key][n]
|
||||||
|
|
||||||
def is_written(self, key):
|
def should_write(self, key):
|
||||||
return self.get(key, 2)
|
return self.get(key, 1) >= self.min_frequency and not self.get(key, 2)
|
||||||
|
|
||||||
def num(self, key):
|
def num(self, key):
|
||||||
return str(self.get(key, 1))
|
return str(self.get(key, 1))
|
||||||
|
@ -1048,8 +1049,8 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
||||||
parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
|
parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
|
||||||
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
|
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
|
||||||
parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true')
|
parser.add_argument('--group', help='Group collocations with same collocation ID', type=int, default=0, const=1, nargs='?')
|
||||||
parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info")
|
parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
|
||||||
parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
|
parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
|
||||||
parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
|
parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user