Add a restriction on spaces around punctuation.

This commit is contained in:
Luka 2020-10-22 13:16:58 +02:00
parent 6dd97838b4
commit c63a9d47da
7 changed files with 110 additions and 23 deletions

View File

@ -1,9 +1,10 @@
from enum import Enum
import logging
from luscenje_struktur.restriction import Restriction
# from luscenje_struktur.restriction import Restriction
from luscenje_struktur.order import Order
from luscenje_struktur.representation_assigner import RepresentationAssigner
from luscenje_struktur.restriction_group import RestrictionGroup
class ComponentStatus(Enum):
@ -38,7 +39,7 @@ class Component:
self.status = status
self.name = name
self.idx = idx
self.restrictions = [Restriction(None)] if 'restriction' in info else []
self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
self.next_element = []
self.representation = []
self.selection = {}
@ -49,15 +50,17 @@ class Component:
def add_next(self, next_component, link_label, order):
self.next_element.append((next_component, link_label, Order.new(order)))
def set_restriction(self, restrictions_tag):
if restrictions_tag is None:
self.restrictions = [Restriction(None)]
def set_restriction(self, restrictions_tags):
if not restrictions_tags:
self.restrictions = RestrictionGroup([None])
elif restrictions_tag.tag == "restriction":
self.restrictions = [Restriction(restrictions_tag)]
# if first element is of type restriction all following are as well
elif restrictions_tags[0].tag == "restriction":
self.restrictions = RestrictionGroup(restrictions_tags)
elif restrictions_tag.tag == "restriction_or":
self.restrictions = [Restriction(el) for el in restrictions_tag]
# combinations of 'and' and 'or' restrictions are currently not implemented
elif restrictions_tags[0].tag == "restriction_or":
self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')
else:
raise RuntimeError("Unreachable")
@ -118,8 +121,7 @@ class Component:
def _match_self(self, word):
# matching
for restr in self.restrictions:
if restr.match(word): # match either
if self.restrictions.match(word):
return {self.idx: word}
def _match_next(self, word):

View File

@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
if extension == ".xml":
et = load_xml(fname)
if input_corpus is None:
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
yield file_sentence_generator(et, args)
else:
sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
for sent_id, sentence, othr_attributes in sentence_generator:
@ -189,16 +189,43 @@ def load_xml(filename):
return ElementTree.XML(xmlstring)
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
def file_sentence_generator(et, args):
skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate
pc_tag = args.pc_tag
use_punctuations = not args.ignore_punctuations
previous_glue = ''
previous_pc = False
words = {}
sentences = list(et.iter('s'))
for sentence in progress(sentences, "load-text"):
# create fake root word
words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
for w in sentence.iter("w"):
last_word_id = None
for w in sentence.iter():
if w.tag == 'w':
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
for pc in sentence.iter(pc_tag):
words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
if use_punctuations:
previous_glue = ''
last_word_id = None
elif w.tag == pc_tag:
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
if use_punctuations:
last_word_id = w.get('id')
words[w.get('id')].previous_glue = previous_glue
previous_glue = ''
elif use_punctuations and w.tag == 'c':
# always save previous glue
previous_glue = w.text
if last_word_id:
words[last_word_id].glue += w.text
# for w in sentence.iter("w"):
# words[w.get('id')] = Word.from_xml(w, do_msd_translate)
# for pc in sentence.iter(pc_tag):
# words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
for l in sentence.iter("link"):
if 'dep' in l.keys():

View File

@ -8,6 +8,7 @@ class RestrictionType(Enum):
Morphology = 0
Lexis = 1
MatchAll = 2
Space = 3
def determine_ppb(rgxs):
@ -123,6 +124,32 @@ class LexisRegex:
def __call__(self, text):
return text in self.match_list
class SpaceRegex:
    """Matcher for "space" restrictions, driven by a ``contact`` attribute.

    The ``contact`` value is a ``|``-separated list of the options
    ``both``, ``right``, ``left`` and ``neither``. A word matches when at
    least one of the listed options describes which of its two glue
    attributes (``previous_glue`` and ``glue``) are non-empty.
    """

    def __init__(self, restriction):
        """Collect the feature attributes and validate the ``contact`` value.

        :param restriction: iterable of features; each one must provide
            ``items()`` yielding ``(name, value)`` pairs. Later features
            override earlier ones that share a name.
        :raises AssertionError: if no feature carries a ``contact`` entry.
        :raises Exception: if any listed option is not one of the four
            supported values.
        """
        attributes = {}
        for feature in restriction:
            attributes.update(feature.items())

        assert "contact" in attributes
        self.space = attributes['contact'].split('|')
        if any(option not in ['both', 'right', 'left', 'neither'] for option in self.space):
            raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')

    def __call__(self, word):
        """Return True when ``word`` satisfies at least one configured option.

        :param word: object exposing ``previous_glue`` and ``glue``.
        :return: bool
        """
        has_previous = word.previous_glue != ''
        has_following = word.glue != ''
        # Outcome of every option for this particular word.
        outcome_by_option = {
            'both': has_previous and has_following,
            'right': (not has_previous) and has_following,
            'left': has_previous and (not has_following),
            'neither': (not has_previous) and (not has_following),
        }
        return any(
            outcome
            for option, outcome in outcome_by_option.items()
            if option in self.space
        )
class Restriction:
def __init__(self, restriction_tag):
self.ppb = 4 # polnopomenska beseda (0-4)
@ -142,6 +169,10 @@ class Restriction:
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis
self.matcher = LexisRegex(list(restriction_tag))
elif restriction_type == "space":
self.type = RestrictionType.Space
self.matcher = SpaceRegex(list(restriction_tag))
else:
raise NotImplementedError()
@ -152,6 +183,8 @@ class Restriction:
match_to = word.lemma
elif self.type == RestrictionType.MatchAll:
return True
elif self.type == RestrictionType.Space:
match_to = word
else:
raise RuntimeError("Unreachable!")

View File

@ -0,0 +1,24 @@
from luscenje_struktur.restriction import Restriction
class RestrictionGroup:
    """A set of ``Restriction`` objects evaluated together.

    With ``group_type='and'`` every restriction has to match a word; with
    ``group_type='or'`` a single matching restriction is enough.
    """

    def __init__(self, restrictions_tag, group_type='and'):
        """Build one ``Restriction`` per element of ``restrictions_tag``.

        :param restrictions_tag: iterable whose elements are each passed to
            ``Restriction``.
        :param group_type: ``'and'`` (default) or ``'or'``; it is only
            checked when ``match`` is called.
        """
        self.restrictions = list(map(Restriction, restrictions_tag))
        self.group_type = group_type

    def __iter__(self):
        """Yield the contained restrictions in their original order."""
        yield from self.restrictions

    def match(self, word):
        """Tell whether ``word`` satisfies the group.

        :param word: value handed to each restriction's ``match``.
        :return: bool; an empty group is True for 'and' and False for 'or'.
        :raises Exception: if ``group_type`` is neither 'and' nor 'or'.
        """
        mode = self.group_type
        if mode not in ('or', 'and'):
            raise Exception("Unsupported group_type - it may only be 'and' or 'or'")

        # Lazy evaluation keeps the short-circuiting: stop at the first hit
        # for 'or', at the first miss for 'and'.
        outcomes = (restr.match(word) for restr in self.restrictions)
        return any(outcomes) if mode == 'or' else all(outcomes)

View File

@ -34,13 +34,12 @@ class SyntacticStructure:
for comp in definitions:
n = comp.get('cid')
restrs[n] = None
restrs[n] = []
forms[n] = []
for el in comp:
if el.tag.startswith("restriction"):
assert restrs[n] is None
restrs[n] = el
restrs[n].append(el)
elif el.tag.startswith("representation"):
st.add_representation(n, el, forms)
else:

View File

@ -32,13 +32,14 @@ class WordDummy:
class Word:
def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
self.lemma = lemma
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
self.id = wid
self.idi = None
self.text = text
self.glue = ''
self.previous_glue = '' if previous_punctuation is None else previous_punctuation
self.fake_word = fake_word
self.links = defaultdict(list)

View File

@ -153,7 +153,8 @@ if __name__ == '__main__':
help='Tag for separators, usually pc or c', default="pc")
parser.add_argument('--separator',
help='Separator in output file', default="\t")
# When set, the sentence loader does not collect the glue ('c' element text)
# around punctuation (use_punctuations = not args.ignore_punctuations).
parser.add_argument('--ignore-punctuations',
                    help="Do not collect glue ('c' elements) around punctuations when loading sentences", action='store_true')
args = parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())