diff --git a/luscenje_struktur/component.py b/luscenje_struktur/component.py
index 5a46120..752479e 100644
--- a/luscenje_struktur/component.py
+++ b/luscenje_struktur/component.py
@@ -1,9 +1,10 @@
 from enum import Enum
 import logging
 
-from luscenje_struktur.restriction import Restriction
+# from luscenje_struktur.restriction import Restriction
 from luscenje_struktur.order import Order
 from luscenje_struktur.representation_assigner import RepresentationAssigner
+from luscenje_struktur.restriction_group import RestrictionGroup
 
 
 class ComponentStatus(Enum):
@@ -38,7 +39,7 @@ class Component:
         self.status = status
         self.name = name
         self.idx = idx
-        self.restrictions = [Restriction(None)] if 'restriction' in info else []
+        self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
         self.next_element = []
         self.representation = []
         self.selection = {}
@@ -49,15 +50,17 @@ class Component:
     def add_next(self, next_component, link_label, order):
         self.next_element.append((next_component, link_label, Order.new(order)))
 
-    def set_restriction(self, restrictions_tag):
-        if restrictions_tag is None:
-            self.restrictions = [Restriction(None)]
+    def set_restriction(self, restrictions_tags):
+        if not restrictions_tags:
+            self.restrictions = RestrictionGroup([None])
 
-        elif restrictions_tag.tag == "restriction":
-            self.restrictions = [Restriction(restrictions_tag)]
+        # if first element is of type restriction all following are as well
+        elif restrictions_tags[0].tag == "restriction":
+            self.restrictions = RestrictionGroup(restrictions_tags)
 
-        elif restrictions_tag.tag == "restriction_or":
-            self.restrictions = [Restriction(el) for el in restrictions_tag]
+        # combinations of 'and' and 'or' restrictions are currently not implemented
+        elif restrictions_tags[0].tag == "restriction_or":
+            self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')
 
         else:
             raise RuntimeError("Unreachable")
@@ -118,9 +121,8 @@ class Component:
 
     def _match_self(self, word):
         # matching
-        for restr in self.restrictions:
-            if restr.match(word): # match either
-                return {self.idx: word}
+        if self.restrictions.match(word):
+            return {self.idx: word}
 
     def _match_next(self, word):
         # matches for every component in links from this component
diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py
index 3cf932a..f0cea89 100644
--- a/luscenje_struktur/loader.py
+++ b/luscenje_struktur/loader.py
@@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
         if extension == ".xml":
             et = load_xml(fname)
             if input_corpus is None:
-                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+                yield file_sentence_generator(et, args)
             else:
                 sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                 for sent_id, sentence, othr_attributes in sentence_generator:
@@ -189,16 +189,43 @@ def load_xml(filename):
     return ElementTree.XML(xmlstring)
 
 
-def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
+def file_sentence_generator(et, args):
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+    pc_tag = args.pc_tag
+    use_punctuations = not args.ignore_punctuations
+    previous_glue = ''
+    previous_pc = False
+
     words = {}
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text"):
         # create fake root word
         words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
-        for w in sentence.iter("w"):
-            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-        for pc in sentence.iter(pc_tag):
-            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+        last_word_id = None
+
+        for w in sentence.iter():
+            if w.tag == 'w':
+                words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+                if use_punctuations:
+                    previous_glue = ''
+                    last_word_id = None
+            elif w.tag == pc_tag:
+                words[w.get('id')] = Word.pc_word(w, do_msd_translate)
+                if use_punctuations:
+                    last_word_id = w.get('id')
+                    words[w.get('id')].previous_glue = previous_glue
+                    previous_glue = ''
+            elif use_punctuations and w.tag == 'c':
+                # always save previous glue
+                previous_glue = w.text
+                if last_word_id:
+                    words[last_word_id].glue += w.text
+
+        # for w in sentence.iter("w"):
+        #     words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+        # for pc in sentence.iter(pc_tag):
+        #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
 
         for l in sentence.iter("link"):
             if 'dep' in l.keys():
diff --git a/luscenje_struktur/restriction.py b/luscenje_struktur/restriction.py
index af4dd68..37bef44 100644
--- a/luscenje_struktur/restriction.py
+++ b/luscenje_struktur/restriction.py
@@ -8,6 +8,7 @@ class RestrictionType(Enum):
     Morphology = 0
     Lexis = 1
     MatchAll = 2
+    Space = 3
 
 
 def determine_ppb(rgxs):
@@ -123,6 +124,32 @@ class LexisRegex:
 
     def __call__(self, text):
         return text in self.match_list
+
+class SpaceRegex:
+    def __init__(self, restriction):
+        restr_dict = {}
+        for feature in restriction:
+            restr_dict.update(feature.items())
+
+        assert "contact" in restr_dict
+        self.space = restr_dict['contact'].split('|')
+        for el in self.space:
+            if el not in ['both', 'right', 'left', 'neither']:
+                raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')
+
+    def __call__(self, word):
+        match = False
+        if 'both' in self.space:
+            match = match or (word.previous_glue != '' and word.glue != '')
+        if 'right' in self.space:
+            match = match or (word.previous_glue == '' and word.glue != '')
+        if 'left' in self.space:
+            match = match or (word.previous_glue != '' and word.glue == '')
+        if 'neither' in self.space:
+            match = match or (word.previous_glue == '' and word.glue == '')
+
+        return match
+
 class Restriction:
     def __init__(self, restriction_tag):
         self.ppb = 4 # polnopomenska beseda (0-4)
@@ -142,6 +169,10 @@ class Restriction:
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
             self.matcher = LexisRegex(list(restriction_tag))
+
+        elif restriction_type == "space":
+            self.type = RestrictionType.Space
+            self.matcher = SpaceRegex(list(restriction_tag))
 
         else:
             raise NotImplementedError()
@@ -152,6 +183,8 @@ class Restriction:
             match_to = word.lemma
         elif self.type == RestrictionType.MatchAll:
             return True
+        elif self.type == RestrictionType.Space:
+            match_to = word
         else:
             raise RuntimeError("Unreachable!")
 
diff --git a/luscenje_struktur/restriction_group.py b/luscenje_struktur/restriction_group.py
new file mode 100644
index 0000000..0ae1d35
--- /dev/null
+++ b/luscenje_struktur/restriction_group.py
@@ -0,0 +1,24 @@
+from luscenje_struktur.restriction import Restriction
+
+class RestrictionGroup:
+    def __init__(self, restrictions_tag, group_type='and'):
+        self.restrictions = [Restriction(el) for el in restrictions_tag]
+        self.group_type = group_type
+
+    def __iter__(self):
+        for restriction in self.restrictions:
+            yield restriction
+
+    def match(self, word):
+        if self.group_type == 'or':
+            for restr in self.restrictions:
+                if restr.match(word): # match either
+                    return True
+            return False
+        elif self.group_type == 'and':
+            for restr in self.restrictions:
+                if not restr.match(word): # match and
+                    return False
+            return True
+        else:
+            raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py
index 3f06158..79d863c 100644
--- a/luscenje_struktur/syntactic_structure.py
+++ b/luscenje_struktur/syntactic_structure.py
@@ -34,13 +34,12 @@ class SyntacticStructure:
 
         for comp in definitions:
             n = comp.get('cid')
-            restrs[n] = None
+            restrs[n] = []
             forms[n] = []
 
             for el in comp:
                 if el.tag.startswith("restriction"):
-                    assert restrs[n] is None
-                    restrs[n] = el
+                    restrs[n].append(el)
                 elif el.tag.startswith("representation"):
                     st.add_representation(n, el, forms)
                 else:
diff --git a/luscenje_struktur/word.py b/luscenje_struktur/word.py
index bf5c423..93d208a 100644
--- a/luscenje_struktur/word.py
+++ b/luscenje_struktur/word.py
@@ -32,13 +32,14 @@ class WordDummy:
 
 
 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
         self.idi = None
         self.text = text
         self.glue = ''
+        self.previous_glue = '' if previous_punctuation is None else previous_punctuation
         self.fake_word = fake_word
         self.links = defaultdict(list)
 
diff --git a/wani.py b/wani.py
index 1ba4b7a..35bcf84 100644
--- a/wani.py
+++ b/wani.py
@@ -153,7 +153,8 @@ if __name__ == '__main__':
                         help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--separator',
                         help='Separator in output file', default="\t")
-
+    parser.add_argument('--ignore-punctuations',
+                        help='Do not track spacing (glue) around punctuation', action='store_true')
     args = parser.parse_args()
 
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
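
Note (editorial addition, not part of the patch): the new "space" restriction matches on the whitespace the loader now records around punctuation tokens (the previous_glue / glue attributes of Word). The sketch below exercises SpaceRegex directly under the assumption that plain dicts can stand in for the XML feature children of a <restriction type="space"> element (only their .items() are read) and that any object with previous_glue and glue attributes can stand in for a Word; the element name "feature" itself is not fixed by the patch.

    from types import SimpleNamespace

    from luscenje_struktur.restriction import SpaceRegex

    # 'contact' lists the accepted spacing patterns, separated by '|'
    matcher = SpaceRegex([{'contact': 'left|right'}])

    # an empty glue string means no whitespace was recorded on that side
    token = SimpleNamespace(previous_glue=' ', glue='')   # whitespace before, none after
    print(matcher(token))   # True  ('left' case: previous_glue != '' and glue == '')

    token = SimpleNamespace(previous_glue=' ', glue=' ')  # whitespace on both sides
    print(matcher(token))   # False (only 'both' accepts this, and it was not requested)

Inside structure definitions these checks are combined through RestrictionGroup: all restrictions listed under a component must match (group_type 'and'), while the children of a restriction_or element are matched disjunctively (group_type 'or').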