Adding a restriction on spaces around punctuation.
This commit is contained in:
parent
6dd97838b4
commit
c63a9d47da
|
@ -1,9 +1,10 @@
|
|||
from enum import Enum
|
||||
import logging
|
||||
|
||||
from luscenje_struktur.restriction import Restriction
|
||||
# from luscenje_struktur.restriction import Restriction
|
||||
from luscenje_struktur.order import Order
|
||||
from luscenje_struktur.representation_assigner import RepresentationAssigner
|
||||
from luscenje_struktur.restriction_group import RestrictionGroup
|
||||
|
||||
|
||||
class ComponentStatus(Enum):
|
||||
|
@ -38,7 +39,7 @@ class Component:
|
|||
self.status = status
|
||||
self.name = name
|
||||
self.idx = idx
|
||||
self.restrictions = [Restriction(None)] if 'restriction' in info else []
|
||||
self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
|
||||
self.next_element = []
|
||||
self.representation = []
|
||||
self.selection = {}
|
||||
|
@ -49,15 +50,17 @@ class Component:
|
|||
def add_next(self, next_component, link_label, order):
|
||||
self.next_element.append((next_component, link_label, Order.new(order)))
|
||||
|
||||
def set_restriction(self, restrictions_tag):
|
||||
if restrictions_tag is None:
|
||||
self.restrictions = [Restriction(None)]
|
||||
def set_restriction(self, restrictions_tags):
|
||||
if not restrictions_tags:
|
||||
self.restrictions = RestrictionGroup([None])
|
||||
|
||||
elif restrictions_tag.tag == "restriction":
|
||||
self.restrictions = [Restriction(restrictions_tag)]
|
||||
# if first element is of type restriction all following are as well
|
||||
elif restrictions_tags[0].tag == "restriction":
|
||||
self.restrictions = RestrictionGroup(restrictions_tags)
|
||||
|
||||
elif restrictions_tag.tag == "restriction_or":
|
||||
self.restrictions = [Restriction(el) for el in restrictions_tag]
|
||||
# combinations of 'and' and 'or' restrictions are currently not implemented
|
||||
elif restrictions_tags[0].tag == "restriction_or":
|
||||
self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')
|
||||
|
||||
else:
|
||||
raise RuntimeError("Unreachable")
|
||||
|
@ -118,9 +121,8 @@ class Component:
|
|||
|
||||
def _match_self(self, word):
|
||||
# matching
|
||||
for restr in self.restrictions:
|
||||
if restr.match(word): # match either
|
||||
return {self.idx: word}
|
||||
if self.restrictions.match(word):
|
||||
return {self.idx: word}
|
||||
|
||||
def _match_next(self, word):
|
||||
# matches for every component in links from this component
|
||||
|
|
|
@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
|
|||
if extension == ".xml":
|
||||
et = load_xml(fname)
|
||||
if input_corpus is None:
|
||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||
yield file_sentence_generator(et, args)
|
||||
else:
|
||||
sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
|
||||
for sent_id, sentence, othr_attributes in sentence_generator:
|
||||
|
@ -189,16 +189,43 @@ def load_xml(filename):
|
|||
return ElementTree.XML(xmlstring)
|
||||
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
||||
def file_sentence_generator(et, args):
|
||||
skip_id_check = args.skip_id_check
|
||||
do_msd_translate = not args.no_msd_translate
|
||||
pc_tag = args.pc_tag
|
||||
use_punctuations = not args.ignore_punctuations
|
||||
previous_glue = ''
|
||||
previous_pc = False
|
||||
|
||||
words = {}
|
||||
sentences = list(et.iter('s'))
|
||||
for sentence in progress(sentences, "load-text"):
|
||||
# create fake root word
|
||||
words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
|
||||
for w in sentence.iter("w"):
|
||||
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||
for pc in sentence.iter(pc_tag):
|
||||
words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
|
||||
last_word_id = None
|
||||
|
||||
for w in sentence.iter():
|
||||
if w.tag == 'w':
|
||||
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||
if use_punctuations:
|
||||
previous_glue = ''
|
||||
last_word_id = None
|
||||
elif w.tag == pc_tag:
|
||||
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
|
||||
if use_punctuations:
|
||||
last_word_id = w.get('id')
|
||||
words[w.get('id')].previous_glue = previous_glue
|
||||
previous_glue = ''
|
||||
elif use_punctuations and w.tag == 'c':
|
||||
# always save previous glue
|
||||
previous_glue = w.text
|
||||
if last_word_id:
|
||||
words[last_word_id].glue += w.text
|
||||
|
||||
# for w in sentence.iter("w"):
|
||||
# words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||
# for pc in sentence.iter(pc_tag):
|
||||
# words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
|
||||
|
||||
for l in sentence.iter("link"):
|
||||
if 'dep' in l.keys():
|
||||
|
|
|
@ -8,6 +8,7 @@ class RestrictionType(Enum):
|
|||
Morphology = 0
|
||||
Lexis = 1
|
||||
MatchAll = 2
|
||||
Space = 3
|
||||
|
||||
|
||||
def determine_ppb(rgxs):
|
||||
|
@ -123,6 +124,32 @@ class LexisRegex:
|
|||
def __call__(self, text):
|
||||
return text in self.match_list
|
||||
|
||||
|
||||
class SpaceRegex:
|
||||
def __init__(self, restriction):
|
||||
restr_dict = {}
|
||||
for feature in restriction:
|
||||
restr_dict.update(feature.items())
|
||||
|
||||
assert "contact" in restr_dict
|
||||
self.space = restr_dict['contact'].split('|')
|
||||
for el in self.space:
|
||||
if el not in ['both', 'right', 'left', 'neither']:
|
||||
raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')
|
||||
|
||||
def __call__(self, word):
|
||||
match = False
|
||||
if 'both' in self.space:
|
||||
match = match or (word.previous_glue != '' and word.glue != '')
|
||||
if 'right' in self.space:
|
||||
match = match or (word.previous_glue == '' and word.glue != '')
|
||||
if 'left' in self.space:
|
||||
match = match or (word.previous_glue != '' and word.glue == '')
|
||||
if 'neither' in self.space:
|
||||
match = match or (word.previous_glue == '' and word.glue == '')
|
||||
|
||||
return match
|
||||
|
||||
class Restriction:
|
||||
def __init__(self, restriction_tag):
|
||||
self.ppb = 4 # polnopomenska beseda (0-4)
|
||||
|
@ -142,6 +169,10 @@ class Restriction:
|
|||
elif restriction_type == "lexis":
|
||||
self.type = RestrictionType.Lexis
|
||||
self.matcher = LexisRegex(list(restriction_tag))
|
||||
|
||||
elif restriction_type == "space":
|
||||
self.type = RestrictionType.Space
|
||||
self.matcher = SpaceRegex(list(restriction_tag))
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
@ -152,6 +183,8 @@ class Restriction:
|
|||
match_to = word.lemma
|
||||
elif self.type == RestrictionType.MatchAll:
|
||||
return True
|
||||
elif self.type == RestrictionType.Space:
|
||||
match_to = word
|
||||
else:
|
||||
raise RuntimeError("Unreachable!")
|
||||
|
||||
|
|
24
luscenje_struktur/restriction_group.py
Normal file
24
luscenje_struktur/restriction_group.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from luscenje_struktur.restriction import Restriction
|
||||
|
||||
class RestrictionGroup:
|
||||
def __init__(self, restrictions_tag, group_type='and'):
|
||||
self.restrictions = [Restriction(el) for el in restrictions_tag]
|
||||
self.group_type = group_type
|
||||
|
||||
def __iter__(self):
|
||||
for restriction in self.restrictions:
|
||||
yield restriction
|
||||
|
||||
def match(self, word):
|
||||
if self.group_type == 'or':
|
||||
for restr in self.restrictions:
|
||||
if restr.match(word): # match either
|
||||
return True
|
||||
return False
|
||||
elif self.group_type == 'and':
|
||||
for restr in self.restrictions:
|
||||
if not restr.match(word): # match and
|
||||
return False
|
||||
return True
|
||||
else:
|
||||
raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
|
|
@ -34,13 +34,12 @@ class SyntacticStructure:
|
|||
|
||||
for comp in definitions:
|
||||
n = comp.get('cid')
|
||||
restrs[n] = None
|
||||
restrs[n] = []
|
||||
forms[n] = []
|
||||
|
||||
for el in comp:
|
||||
if el.tag.startswith("restriction"):
|
||||
assert restrs[n] is None
|
||||
restrs[n] = el
|
||||
restrs[n].append(el)
|
||||
elif el.tag.startswith("representation"):
|
||||
st.add_representation(n, el, forms)
|
||||
else:
|
||||
|
|
|
@ -32,13 +32,14 @@ class WordDummy:
|
|||
|
||||
|
||||
class Word:
|
||||
def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
|
||||
def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
|
||||
self.lemma = lemma
|
||||
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
|
||||
self.id = wid
|
||||
self.idi = None
|
||||
self.text = text
|
||||
self.glue = ''
|
||||
self.previous_glue = '' if previous_punctuation is None else previous_punctuation
|
||||
self.fake_word = fake_word
|
||||
|
||||
self.links = defaultdict(list)
|
||||
|
|
3
wani.py
3
wani.py
|
@ -153,7 +153,8 @@ if __name__ == '__main__':
|
|||
help='Tag for separators, usually pc or c', default="pc")
|
||||
parser.add_argument('--separator',
|
||||
help='Separator in output file', default="\t")
|
||||
|
||||
parser.add_argument('--ignore-punctuations',
|
||||
help="Ignore punctuations (do not track spaces/glue around them)", action='store_true')
|
||||
args = parser.parse_args()
|
||||
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user