Adding restriction on spaces around punctuation.
This commit is contained in:
parent
6dd97838b4
commit
c63a9d47da
@@ -1,9 +1,10 @@
 from enum import Enum
 import logging

-from luscenje_struktur.restriction import Restriction
+# from luscenje_struktur.restriction import Restriction
 from luscenje_struktur.order import Order
 from luscenje_struktur.representation_assigner import RepresentationAssigner
+from luscenje_struktur.restriction_group import RestrictionGroup


 class ComponentStatus(Enum):
@@ -38,7 +39,7 @@ class Component:
         self.status = status
         self.name = name
         self.idx = idx
-        self.restrictions = [Restriction(None)] if 'restriction' in info else []
+        self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
         self.next_element = []
         self.representation = []
         self.selection = {}
@@ -49,15 +50,17 @@ class Component:
     def add_next(self, next_component, link_label, order):
         self.next_element.append((next_component, link_label, Order.new(order)))

-    def set_restriction(self, restrictions_tag):
-        if restrictions_tag is None:
-            self.restrictions = [Restriction(None)]
+    def set_restriction(self, restrictions_tags):
+        if not restrictions_tags:
+            self.restrictions = RestrictionGroup([None])

-        elif restrictions_tag.tag == "restriction":
-            self.restrictions = [Restriction(restrictions_tag)]
+        # if first element is of type restriction all following are as well
+        elif restrictions_tags[0].tag == "restriction":
+            self.restrictions = RestrictionGroup(restrictions_tags)

-        elif restrictions_tag.tag == "restriction_or":
-            self.restrictions = [Restriction(el) for el in restrictions_tag]
+        # combinations of 'and' and 'or' restrictions are currently not implemented
+        elif restrictions_tags[0].tag == "restriction_or":
+            self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')

         else:
             raise RuntimeError("Unreachable")
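For illustration, a minimal sketch (not part of the commit) of which branch the reworked set_restriction takes for the element shapes it distinguishes. The tag names restriction and restriction_or come from the code above; the wrapping element name and the type attributes are made-up placeholders.

# Illustrative only; behaviour per set_restriction above.
from xml.etree import ElementTree

and_tags = list(ElementTree.XML(
    '<component>'
    '<restriction type="morphology"/>'
    '<restriction type="lexis"/>'
    '</component>'))
or_tag = ElementTree.XML(
    '<restriction_or>'
    '<restriction type="lexis"/>'
    '<restriction type="lexis"/>'
    '</restriction_or>')

# set_restriction(and_tags) -> RestrictionGroup(and_tags)                 ('and' group)
# set_restriction([or_tag]) -> RestrictionGroup(or_tag, group_type='or')  ('or' group over its children)
# set_restriction([])       -> RestrictionGroup([None])                   (match-all restriction)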
@@ -118,8 +121,7 @@ class Component:

     def _match_self(self, word):
         # matching
-        for restr in self.restrictions:
-            if restr.match(word): # match either
-                return {self.idx: word}
+        if self.restrictions.match(word):
+            return {self.idx: word}

     def _match_next(self, word):
@@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
         if extension == ".xml":
             et = load_xml(fname)
             if input_corpus is None:
-                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+                yield file_sentence_generator(et, args)
             else:
                 sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                 for sent_id, sentence, othr_attributes in sentence_generator:
@@ -189,16 +189,43 @@ def load_xml(filename):
     return ElementTree.XML(xmlstring)


-def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
+def file_sentence_generator(et, args):
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+    pc_tag = args.pc_tag
+    use_punctuations = not args.ignore_punctuations
+    previous_glue = ''
+    previous_pc = False
+
     words = {}
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text"):
         # create fake root word
         words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
-        for w in sentence.iter("w"):
-            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-        for pc in sentence.iter(pc_tag):
-            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+        last_word_id = None
+
+        for w in sentence.iter():
+            if w.tag == 'w':
+                words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+                if use_punctuations:
+                    previous_glue = ''
+                    last_word_id = None
+            elif w.tag == pc_tag:
+                words[w.get('id')] = Word.pc_word(w, do_msd_translate)
+                if use_punctuations:
+                    last_word_id = w.get('id')
+                    words[w.get('id')].previous_glue = previous_glue
+                    previous_glue = ''
+            elif use_punctuations and w.tag == 'c':
+                # always save previous glue
+                previous_glue = w.text
+                if last_word_id:
+                    words[last_word_id].glue += w.text

+        # for w in sentence.iter("w"):
+        #     words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+        # for pc in sentence.iter(pc_tag):
+        #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+
         for l in sentence.iter("link"):
             if 'dep' in l.keys():
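For illustration, a rough re-run (not part of the commit) of the glue bookkeeping in the loop above on a hand-made sentence element. The w/pc/c element names follow the loop; the sample sentence is made up, and Token is a stand-in for Word, whose full definition is not shown here.

# Rough, illustrative re-run of the glue bookkeeping above; Token stands in for Word.
from xml.etree import ElementTree

class Token:
    def __init__(self, text):
        self.text = text
        self.glue = ''           # whitespace seen after the token
        self.previous_glue = ''  # whitespace seen before a punctuation token

sentence = ElementTree.XML(
    '<s><w id="w1">word</w><c> </c><pc id="pc1">,</pc><c> </c><w id="w2">next</w></s>')

words, previous_glue, last_word_id = {}, '', None
for el in sentence.iter():
    if el.tag == 'w':
        words[el.get('id')] = Token(el.text)
        previous_glue, last_word_id = '', None
    elif el.tag == 'pc':
        words[el.get('id')] = Token(el.text)
        last_word_id = el.get('id')
        words[last_word_id].previous_glue = previous_glue
        previous_glue = ''
    elif el.tag == 'c':
        previous_glue = el.text
        if last_word_id:
            words[last_word_id].glue += el.text

print(repr(words['pc1'].previous_glue), repr(words['pc1'].glue))  # ' ' ' ' -> space on both sides of the comma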
@@ -8,6 +8,7 @@ class RestrictionType(Enum):
     Morphology = 0
     Lexis = 1
     MatchAll = 2
+    Space = 3


 def determine_ppb(rgxs):
@@ -123,6 +124,32 @@ class LexisRegex:
     def __call__(self, text):
         return text in self.match_list


+class SpaceRegex:
+    def __init__(self, restriction):
+        restr_dict = {}
+        for feature in restriction:
+            restr_dict.update(feature.items())
+
+        assert "contact" in restr_dict
+        self.space = restr_dict['contact'].split('|')
+        for el in self.space:
+            if el not in ['both', 'right', 'left', 'neither']:
+                raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')
+
+    def __call__(self, word):
+        match = False
+        if 'both' in self.space:
+            match = match or (word.previous_glue != '' and word.glue != '')
+        if 'right' in self.space:
+            match = match or (word.previous_glue == '' and word.glue != '')
+        if 'left' in self.space:
+            match = match or (word.previous_glue != '' and word.glue == '')
+        if 'neither' in self.space:
+            match = match or (word.previous_glue == '' and word.glue == '')
+
+        return match
+
+
 class Restriction:
     def __init__(self, restriction_tag):
         self.ppb = 4  # polnopomenska beseda (0-4)
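The four checks in SpaceRegex.__call__ are mutually exclusive, so every word falls into exactly one case and the matcher returns True when that case is listed in the contact value. A small sketch (not part of the commit), using a stand-in token that only carries the two glue fields SpaceRegex reads:

# Stand-in token: SpaceRegex only reads previous_glue and glue.
from collections import namedtuple

Tok = namedtuple('Tok', ['previous_glue', 'glue'])

def space_kind(word):
    # the single case the checks in SpaceRegex.__call__ would accept for this word
    if word.previous_glue and word.glue:
        return 'both'      # whitespace on both sides
    if not word.previous_glue and word.glue:
        return 'right'     # whitespace only after the token
    if word.previous_glue and not word.glue:
        return 'left'      # whitespace only before the token
    return 'neither'       # no surrounding whitespace

assert space_kind(Tok(' ', ' ')) == 'both'
assert space_kind(Tok('', ' ')) == 'right'
assert space_kind(Tok(' ', '')) == 'left'
assert space_kind(Tok('', '')) == 'neither'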
@@ -142,6 +169,10 @@ class Restriction:
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
             self.matcher = LexisRegex(list(restriction_tag))

+        elif restriction_type == "space":
+            self.type = RestrictionType.Space
+            self.matcher = SpaceRegex(list(restriction_tag))
+
         else:
             raise NotImplementedError()
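For orientation, a hypothetical shape of a "space" restriction as SpaceRegex would parse it (not part of the commit). The feature element name is taken from the loop variable in SpaceRegex.__init__ and may differ in the real structure definitions; only the contact attribute and its allowed values are fixed by the code.

# Hypothetical "space" restriction, parsed the same way as SpaceRegex.__init__ above.
from xml.etree import ElementTree

restriction_tag = ElementTree.XML(
    '<restriction type="space"><feature contact="right|both"/></restriction>')

restr_dict = {}
for feature in restriction_tag:          # same loop as in SpaceRegex.__init__
    restr_dict.update(feature.items())
print(restr_dict['contact'].split('|'))  # ['right', 'both']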
@@ -152,6 +183,8 @@ class Restriction:
             match_to = word.lemma
         elif self.type == RestrictionType.MatchAll:
             return True
+        elif self.type == RestrictionType.Space:
+            match_to = word
         else:
             raise RuntimeError("Unreachable!")

luscenje_struktur/restriction_group.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+from luscenje_struktur.restriction import Restriction
+
+class RestrictionGroup:
+    def __init__(self, restrictions_tag, group_type='and'):
+        self.restrictions = [Restriction(el) for el in restrictions_tag]
+        self.group_type = group_type
+
+    def __iter__(self):
+        for restriction in self.restrictions:
+            yield restriction
+
+    def match(self, word):
+        if self.group_type == 'or':
+            for restr in self.restrictions:
+                if restr.match(word): # match either
+                    return True
+            return False
+        elif self.group_type == 'and':
+            for restr in self.restrictions:
+                if not restr.match(word): # match and
+                    return False
+            return True
+        else:
+            raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
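A short usage sketch (not part of the commit) of the 'and'/'or' semantics above. DummyRestriction stands in for Restriction, which normally needs an XML tag, and Group restates RestrictionGroup.match with any/all instead of explicit loops.

# Stubbed illustration of RestrictionGroup.match; DummyRestriction replaces Restriction.
class DummyRestriction:
    def __init__(self, result):
        self.result = result
    def match(self, word):
        return self.result

class Group:
    # behaviourally equivalent to RestrictionGroup.match, but over prebuilt restrictions
    def __init__(self, restrictions, group_type='and'):
        self.restrictions, self.group_type = restrictions, group_type
    def match(self, word):
        if self.group_type == 'or':
            return any(r.match(word) for r in self.restrictions)   # match either
        return all(r.match(word) for r in self.restrictions)       # match and

assert Group([DummyRestriction(True), DummyRestriction(False)], 'or').match(None) is True
assert Group([DummyRestriction(True), DummyRestriction(False)], 'and').match(None) is False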
@@ -34,13 +34,12 @@ class SyntacticStructure:

         for comp in definitions:
             n = comp.get('cid')
-            restrs[n] = None
+            restrs[n] = []
             forms[n] = []

             for el in comp:
                 if el.tag.startswith("restriction"):
-                    assert restrs[n] is None
-                    restrs[n] = el
+                    restrs[n].append(el)
                 elif el.tag.startswith("representation"):
                     st.add_representation(n, el, forms)
                 else:
@@ -32,13 +32,14 @@ class WordDummy:


 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
         self.idi = None
         self.text = text
         self.glue = ''
+        self.previous_glue = '' if previous_punctuation is None else previous_punctuation
         self.fake_word = fake_word

         self.links = defaultdict(list)
wani.py (3 lines changed)
@@ -153,7 +153,8 @@ if __name__ == '__main__':
                         help='Tag for separators, usually pc or c', default="pc")
     parser.add_argument('--separator',
                         help='Separator in output file', default="\t")
+    parser.add_argument('--ignore-punctuations',
+                        help='Ignore punctuations', action='store_true')
     args = parser.parse_args()

     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

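A minimal sketch (not part of the commit) of how the new flag travels from the command line into the loader: argparse exposes it as args.ignore_punctuations, which file_sentence_generator negates into use_punctuations.

# Sketch: argparse turns --ignore-punctuations into args.ignore_punctuations,
# which file_sentence_generator negates into use_punctuations.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ignore-punctuations', action='store_true',
                    help='Skip the space/punctuation bookkeeping while loading sentences')

args = parser.parse_args(['--ignore-punctuations'])
use_punctuations = not args.ignore_punctuations
print(use_punctuations)  # False here; True when the flag is omitted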