Add a restriction on spaces around punctuation.

2020-10-22 13:16:58 +02:00
parent 6dd97838b4
commit c63a9d47da
7 changed files with 110 additions and 23 deletions
@@ -1,9 +1,10 @@
 from enum import Enum
 import logging

-from luscenje_struktur.restriction import Restriction
+# from luscenje_struktur.restriction import Restriction
 from luscenje_struktur.order import Order
 from luscenje_struktur.representation_assigner import RepresentationAssigner
+from luscenje_struktur.restriction_group import RestrictionGroup


 class ComponentStatus(Enum):
@@ -38,7 +39,7 @@ class Component:
        self.status = status
        self.name = name
        self.idx = idx
-        self.restrictions = [Restriction(None)] if 'restriction' in info else []
+        self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
        self.next_element = []
        self.representation = []
        self.selection = {}
@@ -49,15 +50,17 @@ class Component:
    def add_next(self, next_component, link_label, order):
        self.next_element.append((next_component, link_label, Order.new(order)))

-    def set_restriction(self, restrictions_tag):
-        if restrictions_tag is None:
-            self.restrictions = [Restriction(None)]
+    def set_restriction(self, restrictions_tags):
+        if not restrictions_tags:
+            self.restrictions = RestrictionGroup([None])

-        elif restrictions_tag.tag == "restriction":
-            self.restrictions = [Restriction(restrictions_tag)]
+        # if first element is of type restriction all following are as well
+        elif restrictions_tags[0].tag == "restriction":
+            self.restrictions = RestrictionGroup(restrictions_tags)

-        elif restrictions_tag.tag == "restriction_or":
-            self.restrictions = [Restriction(el) for el in restrictions_tag]
+        # combinations of 'and' and 'or' restrictions are currently not implemented
+        elif restrictions_tags[0].tag == "restriction_or":
+            self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')

        else:
            raise RuntimeError("Unreachable")
@@ -118,8 +121,7 @@ class Component:

    def _match_self(self, word):
        # matching
-        for restr in self.restrictions:
-            if restr.match(word): # match either
+        if self.restrictions.match(word):
            return {self.idx: word}

    def _match_next(self, word):
@@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
        if extension == ".xml":
            et = load_xml(fname)
            if input_corpus is None:
-                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+                yield file_sentence_generator(et, args)
            else:
                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                for sent_id, sentence, othr_attributes in sentence_generator:
@@ -189,16 +189,43 @@ def load_xml(filename):
    return ElementTree.XML(xmlstring)


-def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
+def file_sentence_generator(et, args):
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+    pc_tag = args.pc_tag
+    use_punctuations = not args.ignore_punctuations
+    previous_glue = ''
+    previous_pc = False
+
    words = {}
    sentences = list(et.iter('s'))
    for sentence in progress(sentences, "load-text"):
        # create fake root word
        words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
-        for w in sentence.iter("w"):
+        last_word_id = None
+
+        for w in sentence.iter():
+            if w.tag == 'w':
                words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-        for pc in sentence.iter(pc_tag):
-            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+                if use_punctuations:
+                    previous_glue = ''
+                    last_word_id = None
+            elif w.tag == pc_tag:
+                words[w.get('id')] = Word.pc_word(w, do_msd_translate)
+                if use_punctuations:
+                    last_word_id = w.get('id')
+                    words[w.get('id')].previous_glue = previous_glue
+                    previous_glue = ''
+            elif use_punctuations and w.tag == 'c':
+                # always save previous glue
+                previous_glue = w.text
+                if last_word_id:
+                    words[last_word_id].glue += w.text
+
+        # for w in sentence.iter("w"):
+        #     words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+        # for pc in sentence.iter(pc_tag):
+        #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
@@ -8,6 +8,7 @@ class RestrictionType(Enum):
    Morphology = 0
    Lexis = 1
    MatchAll = 2
+    Space = 3


 def determine_ppb(rgxs):
@@ -123,6 +124,32 @@ class LexisRegex:
    def __call__(self, text):
        return text in self.match_list

+
+class SpaceRegex:
+    # Matcher for "space" restrictions. It is configured with a "contact"
+    # attribute and, when called, checks a word's `previous_glue` and `glue`
+    # strings against the configured alternatives.
+    #
+    # restriction: iterable of feature elements, each providing .items() that
+    #   yields (attribute, value) pairs.
+    # Fails the assertion when no feature carries a "contact" attribute, and
+    # raises Exception when any '|'-separated contact value is not one of
+    # both / right / left / neither.
+    def __init__(self, restriction):
+        restr_dict = {}
+        for feature in restriction:
+            # Attributes of all features are merged into one dict; on duplicate
+            # attribute names the later feature overwrites the earlier one.
+            restr_dict.update(feature.items())
+
+        # NOTE: assert statements are skipped under `python -O`; a missing
+        # "contact" would then surface as a KeyError on the following line.
+        assert "contact" in restr_dict
+        # "contact" may list several alternatives separated by '|'.
+        self.space = restr_dict['contact'].split('|')
+        for el in self.space:
+            if el not in ['both', 'right', 'left', 'neither']:
+                raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')
+
+    # Returns True when `word` satisfies at least one configured alternative.
+    # Reads word.previous_glue and word.glue and compares them with ''.
+    def __call__(self, word):
+        match = False
+        if 'both' in self.space:
+            # both previous_glue and glue are non-empty
+            match = match or (word.previous_glue != '' and word.glue != '')
+        if 'right' in self.space:
+            # only glue is non-empty
+            match = match or (word.previous_glue == '' and word.glue != '')
+        if 'left' in self.space:
+            # only previous_glue is non-empty
+            match = match or (word.previous_glue != '' and word.glue == '')
+        if 'neither' in self.space:
+            # both previous_glue and glue are empty
+            match = match or (word.previous_glue == '' and word.glue == '')
+
+        return match
+
 class Restriction:
    def __init__(self, restriction_tag):
        self.ppb = 4 # polnopomenska beseda (0-4)
@@ -142,6 +169,10 @@ class Restriction:
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
+
+        elif restriction_type == "space":
+            self.type = RestrictionType.Space
+            self.matcher = SpaceRegex(list(restriction_tag))
        else:
            raise NotImplementedError()

@@ -152,6 +183,8 @@ class Restriction:
            match_to = word.lemma
        elif self.type == RestrictionType.MatchAll:
            return True
+        elif self.type == RestrictionType.Space:
+            match_to = word
        else:
            raise RuntimeError("Unreachable!")

@@ -0,0 +1,24 @@
+from luscenje_struktur.restriction import Restriction
+
+class RestrictionGroup:
+    # A collection of Restriction objects whose individual match results are
+    # combined with either 'and' (all must match) or 'or' (any may match).
+    #
+    # restrictions_tag: iterable; every element is passed to Restriction(...).
+    # group_type: 'and' (default) or 'or'. The value is not validated here;
+    #   an unsupported value is only reported when match() is called.
+    def __init__(self, restrictions_tag, group_type='and'):
+        self.restrictions = [Restriction(el) for el in restrictions_tag]
+        self.group_type = group_type
+
+    # Iterating the group yields its Restriction objects in construction order.
+    def __iter__(self):
+        for restriction in self.restrictions:
+            yield restriction
+
+    # Returns True/False depending on whether `word` satisfies the group.
+    # Raises Exception when group_type is neither 'and' nor 'or'.
+    def match(self, word):
+        if self.group_type == 'or':
+            # True on the first matching restriction; an empty group gives False.
+            for restr in self.restrictions:
+                if restr.match(word): # match either
+                    return True
+            return False
+        elif self.group_type == 'and':
+            # False on the first failing restriction; an empty group gives True.
+            for restr in self.restrictions:
+                if not restr.match(word): # match and
+                    return False
+            return True
+        else:
+            raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
@@ -34,13 +34,12 @@ class SyntacticStructure:

        for comp in definitions:
            n = comp.get('cid')
-            restrs[n] = None
+            restrs[n] = []
            forms[n] = []

            for el in comp:
                if el.tag.startswith("restriction"):
-                    assert restrs[n] is None
-                    restrs[n] = el
+                    restrs[n].append(el)
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
@@ -32,13 +32,14 @@ class WordDummy:


 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
        self.lemma = lemma
        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
        self.id = wid
        self.idi = None
        self.text = text
        self.glue = ''
+        self.previous_glue = '' if previous_punctuation is None else previous_punctuation
        self.fake_word = fake_word

        self.links = defaultdict(list)
@@ -153,7 +153,8 @@ if __name__ == '__main__':
                        help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--separator',
                        help='Separator in output file', default="\t")
-
+    parser.add_argument('--ignore-punctuations',
+                        help="Sort in reversed ored", action='store_true')
    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())