Implement two output formats, "all" (--all) and normal (--output); remove the --lemma-only and --without-rep options and replace --group with --min_freq.
The representative form is still not implemented in the normal output format (it currently writes the placeholder "REP?").
This commit is contained in:
		
							parent
							
								
									b4b93022fe
								
							
						
					
					
						commit
						401698409e
					
				
							
								
								
									
										66
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										66
									
								
								wani.py
									
									
									
									
									
								
							@ -388,6 +388,7 @@ class Component:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            for feature in representation:
 | 
					            for feature in representation:
 | 
				
			||||||
                f = ComponentRepresentation.new(dict(feature.attrib))
 | 
					                f = ComponentRepresentation.new(dict(feature.attrib))
 | 
				
			||||||
 | 
					                print(f)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                if type(f) is None:
 | 
					                if type(f) is None:
 | 
				
			||||||
                    logging.warning("Unknown representation in component {}, skipping...".format(self.idx), file=sys.stderr)
 | 
					                    logging.warning("Unknown representation in component {}, skipping...".format(self.idx), file=sys.stderr)
 | 
				
			||||||
@ -812,49 +813,48 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
 | 
				
			|||||||
    return list(words.values())
 | 
					    return list(words.values())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Writer:
 | 
					class Writer:
 | 
				
			||||||
    def __init__(self, args):
 | 
					    @staticmethod
 | 
				
			||||||
        self.group = args.group
 | 
					    def make_output_writer(args):
 | 
				
			||||||
        self.lemma_only = args.lemma_only
 | 
					        return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed)
 | 
				
			||||||
        self.without_rep = args.without_rep
 | 
					 | 
				
			||||||
        self.output_file = args.output
 | 
					 | 
				
			||||||
        self.multiple_output = args.multiple_output
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
        self.sort_by = int(args.sort_by)
 | 
					    @staticmethod
 | 
				
			||||||
        self.sort_order = args.sort_reversed
 | 
					    def make_all_writer(args):
 | 
				
			||||||
 | 
					        return Writer(True, args.all, False, -1, False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, all, filename, multiple_output, sort_by, sort_reversed):
 | 
				
			||||||
 | 
					        self.all = all
 | 
				
			||||||
 | 
					        self.output_file = filename
 | 
				
			||||||
 | 
					        self.multiple_output = multiple_output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.sort_by = sort_by
 | 
				
			||||||
 | 
					        self.sort_order = sort_reversed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def header(self):
 | 
					    def header(self):
 | 
				
			||||||
        cols = ["Lemma"]
 | 
					        cols = ["Lemma"]
 | 
				
			||||||
        if not self.lemma_only:
 | 
					        if self.all:
 | 
				
			||||||
            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
 | 
					            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
        if not self.without_rep:
 | 
					 | 
				
			||||||
            cols.append("Representative_form")
 | 
					            cols.append("Representative_form")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        assert(len(cols) == self.length())
 | 
					        assert(len(cols) == self.length())
 | 
				
			||||||
        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
 | 
					        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
 | 
				
			||||||
        cols = ["Structure_ID"] + cols + ["Collocation_ID"]
 | 
					        cols = ["Structure_ID"] + cols + ["Colocation_ID"]
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        if not self.without_rep:
 | 
					        if not self.all:
 | 
				
			||||||
            cols.append("Joint_representative_form")
 | 
					            cols += ["Joint_representative_form", "Frequency"]
 | 
				
			||||||
        if self.group:
 | 
					 | 
				
			||||||
            cols.append("Frequency")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return cols
 | 
					        return cols
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def length(self):
 | 
					    def length(self):
 | 
				
			||||||
        return 1 + 3 * int(not self.lemma_only) + int(not self.without_rep)
 | 
					        return 4 if self.all else 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_word(self, word):
 | 
					    def from_word(self, word):
 | 
				
			||||||
        if word is None:
 | 
					        if word is None:
 | 
				
			||||||
            return "" * self.length()
 | 
					            return "" * self.length()
 | 
				
			||||||
 | 
					        elif self.all:
 | 
				
			||||||
 | 
					            return [word.id, word.text, word.lemma, word.msd]
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            cols = [word.lemma]
 | 
					            return [word.lemma, "REP?"]
 | 
				
			||||||
            if not self.lemma_only:
 | 
					 | 
				
			||||||
                cols = [word.id, word.text] + cols + [word.msd]
 | 
					 | 
				
			||||||
            if not self.without_rep:
 | 
					 | 
				
			||||||
                cols += [""] #not yet implemented...
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        return cols
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    def sorted_rows(self, rows):
 | 
					    def sorted_rows(self, rows):
 | 
				
			||||||
        if self.sort_by < 0 or len(rows) < 2:
 | 
					        if self.sort_by < 0 or len(rows) < 2:
 | 
				
			||||||
@ -889,10 +889,7 @@ class Writer:
 | 
				
			|||||||
            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) 
 | 
					            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) 
 | 
				
			||||||
            to_write = [structure_id] + to_write + [colocation_ids.to_id(cid)]
 | 
					            to_write = [structure_id] + to_write + [colocation_ids.to_id(cid)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if not self.without_rep:
 | 
					            if not self.all:
 | 
				
			||||||
                to_write.append("") # not yet implemented...
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if self.group:
 | 
					 | 
				
			||||||
                if colocation_ids.should_write(cid):
 | 
					                if colocation_ids.should_write(cid):
 | 
				
			||||||
                    to_write.append(colocation_ids.num(cid))
 | 
					                    to_write.append(colocation_ids.num(cid))
 | 
				
			||||||
                    colocation_ids.set_written(cid)
 | 
					                    colocation_ids.set_written(cid)
 | 
				
			||||||
@ -941,7 +938,7 @@ class Writer:
 | 
				
			|||||||
class ColocationIds:
 | 
					class ColocationIds:
 | 
				
			||||||
    def __init__(self):
 | 
					    def __init__(self):
 | 
				
			||||||
        self.data = {}
 | 
					        self.data = {}
 | 
				
			||||||
        self.min_frequency = args.group
 | 
					        self.min_frequency = args.min_freq
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def add_match(self, key):
 | 
					    def add_match(self, key):
 | 
				
			||||||
        if key in self.data:
 | 
					        if key in self.data:
 | 
				
			||||||
@ -992,7 +989,6 @@ def match_file(words, structures):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main(input_file, structures_file, args):
 | 
					def main(input_file, structures_file, args):
 | 
				
			||||||
    writer = Writer(args)
 | 
					 | 
				
			||||||
    structures = build_structures(structures_file)
 | 
					    structures = build_structures(structures_file)
 | 
				
			||||||
    for s in structures:
 | 
					    for s in structures:
 | 
				
			||||||
        logging.debug(str(s))
 | 
					        logging.debug(str(s))
 | 
				
			||||||
@ -1039,7 +1035,9 @@ def main(input_file, structures_file, args):
 | 
				
			|||||||
            else:
 | 
					            else:
 | 
				
			||||||
                matches = colocation_ids.merge_matches(matches, new_matches)
 | 
					                matches = colocation_ids.merge_matches(matches, new_matches)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    writer.write_out(matches, structures, colocation_ids)
 | 
					    if args.all:
 | 
				
			||||||
 | 
					        Writer.make_all_writer(args).write_out(matches, structures, colocation_ids)
 | 
				
			||||||
 | 
					    Writer.make_output_writer(args).write_out(matches, structures, colocation_ids)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    logging.debug([(k, len(v)) for k, v in matches.items()])
 | 
					    logging.debug([(k, len(v)) for k, v in matches.items()])
 | 
				
			||||||
    logging.debug(sum(len(v) for _, v in matches.items()))
 | 
					    logging.debug(sum(len(v) for _, v in matches.items()))
 | 
				
			||||||
@ -1049,12 +1047,11 @@ if __name__ == '__main__':
 | 
				
			|||||||
    parser.add_argument('structures', help='Structures definitions in xml file')
 | 
					    parser.add_argument('structures', help='Structures definitions in xml file')
 | 
				
			||||||
    parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
 | 
					    parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
 | 
				
			||||||
    parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
 | 
					    parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
 | 
				
			||||||
 | 
					    parser.add_argument('--all', help='Additional output file, writes more data')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
 | 
					    parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
 | 
				
			||||||
    parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
 | 
					    parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
 | 
				
			||||||
    parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
 | 
					    parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1, nargs='?')
 | 
				
			||||||
    parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
 | 
					 | 
				
			||||||
    parser.add_argument('--group', help='Group collocations with same collocation ID', type=int, default=0, const=1, nargs='?')
 | 
					 | 
				
			||||||
    parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
 | 
					    parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
 | 
				
			||||||
    parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
 | 
					    parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
 | 
				
			||||||
    parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
 | 
					    parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
 | 
				
			||||||
@ -1072,3 +1069,4 @@ if __name__ == '__main__':
 | 
				
			|||||||
    start = time.time()
 | 
					    start = time.time()
 | 
				
			||||||
    main(args.input, args.structures, args)
 | 
					    main(args.input, args.structures, args)
 | 
				
			||||||
    logging.info("TIME: {}".format(time.time() - start))
 | 
					    logging.info("TIME: {}".format(time.time() - start))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user