Adding restriction on spaces around punctuation.
This commit is contained in:
parent
6dd97838b4
commit
c63a9d47da
@@ -1,9 +1,10 @@
 from enum import Enum
 import logging

-from luscenje_struktur.restriction import Restriction
+# from luscenje_struktur.restriction import Restriction
 from luscenje_struktur.order import Order
 from luscenje_struktur.representation_assigner import RepresentationAssigner
+from luscenje_struktur.restriction_group import RestrictionGroup


 class ComponentStatus(Enum):
@@ -38,7 +39,7 @@ class Component:
         self.status = status
         self.name = name
         self.idx = idx
-        self.restrictions = [Restriction(None)] if 'restriction' in info else []
+        self.restrictions = RestrictionGroup([None]) if 'restriction' in info else []
         self.next_element = []
         self.representation = []
         self.selection = {}
@@ -49,15 +50,17 @@ class Component:
     def add_next(self, next_component, link_label, order):
         self.next_element.append((next_component, link_label, Order.new(order)))

-    def set_restriction(self, restrictions_tag):
-        if restrictions_tag is None:
-            self.restrictions = [Restriction(None)]
+    def set_restriction(self, restrictions_tags):
+        if not restrictions_tags:
+            self.restrictions = RestrictionGroup([None])

-        elif restrictions_tag.tag == "restriction":
-            self.restrictions = [Restriction(restrictions_tag)]
+        # if first element is of type restriction all following are as well
+        elif restrictions_tags[0].tag == "restriction":
+            self.restrictions = RestrictionGroup(restrictions_tags)

-        elif restrictions_tag.tag == "restriction_or":
-            self.restrictions = [Restriction(el) for el in restrictions_tag]
+        # combinations of 'and' and 'or' restrictions are currently not implemented
+        elif restrictions_tags[0].tag == "restriction_or":
+            self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or')

         else:
             raise RuntimeError("Unreachable")
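For illustration, a minimal sketch (not part of the commit) of which branch the reworked set_restriction takes for the element shapes it distinguishes. The tag names restriction and restriction_or come from the code above; the wrapping element name and the type attributes are made-up placeholders.

# Illustrative only; behaviour per set_restriction above.
from xml.etree import ElementTree

and_tags = list(ElementTree.XML(
    '<component>'
    '<restriction type="morphology"/>'
    '<restriction type="lexis"/>'
    '</component>'))
or_tag = ElementTree.XML(
    '<restriction_or>'
    '<restriction type="lexis"/>'
    '<restriction type="lexis"/>'
    '</restriction_or>')

# set_restriction(and_tags) -> RestrictionGroup(and_tags)                 ('and' group)
# set_restriction([or_tag]) -> RestrictionGroup(or_tag, group_type='or')  ('or' group over its children)
# set_restriction([])       -> RestrictionGroup([None])                   (match-all restriction)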
@@ -118,8 +121,7 @@ class Component:

     def _match_self(self, word):
         # matching
-        for restr in self.restrictions:
-            if restr.match(word): # match either
-                return {self.idx: word}
+        if self.restrictions.match(word):
+            return {self.idx: word}

     def _match_next(self, word):
@@ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None):
         if extension == ".xml":
             et = load_xml(fname)
             if input_corpus is None:
-                yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+                yield file_sentence_generator(et, args)
             else:
                 sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                 for sent_id, sentence, othr_attributes in sentence_generator:
@@ -189,16 +189,43 @@ def load_xml(filename):
     return ElementTree.XML(xmlstring)


-def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
+def file_sentence_generator(et, args):
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+    pc_tag = args.pc_tag
+    use_punctuations = not args.ignore_punctuations
+    previous_glue = ''
+    previous_pc = False
+
     words = {}
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text"):
         # create fake root word
         words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
-        for w in sentence.iter("w"):
-            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-        for pc in sentence.iter(pc_tag):
-            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+        last_word_id = None
+
+        for w in sentence.iter():
+            if w.tag == 'w':
+                words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+                if use_punctuations:
+                    previous_glue = ''
+                    last_word_id = None
+            elif w.tag == pc_tag:
+                words[w.get('id')] = Word.pc_word(w, do_msd_translate)
+                if use_punctuations:
+                    last_word_id = w.get('id')
+                    words[w.get('id')].previous_glue = previous_glue
+                    previous_glue = ''
+            elif use_punctuations and w.tag == 'c':
+                # always save previous glue
+                previous_glue = w.text
+                if last_word_id:
+                    words[last_word_id].glue += w.text

+        # for w in sentence.iter("w"):
+        #     words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+        # for pc in sentence.iter(pc_tag):
+        #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+
         for l in sentence.iter("link"):
             if 'dep' in l.keys():
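For illustration, a rough re-run (not part of the commit) of the glue bookkeeping in the loop above on a hand-made sentence element. The w/pc/c element names follow the loop; the sample sentence is made up, and Token is a stand-in for Word, whose full definition is not shown here.

# Rough, illustrative re-run of the glue bookkeeping above; Token stands in for Word.
from xml.etree import ElementTree

class Token:
    def __init__(self, text):
        self.text = text
        self.glue = ''           # whitespace seen after the token
        self.previous_glue = ''  # whitespace seen before a punctuation token

sentence = ElementTree.XML(
    '<s><w id="w1">word</w><c> </c><pc id="pc1">,</pc><c> </c><w id="w2">next</w></s>')

words, previous_glue, last_word_id = {}, '', None
for el in sentence.iter():
    if el.tag == 'w':
        words[el.get('id')] = Token(el.text)
        previous_glue, last_word_id = '', None
    elif el.tag == 'pc':
        words[el.get('id')] = Token(el.text)
        last_word_id = el.get('id')
        words[last_word_id].previous_glue = previous_glue
        previous_glue = ''
    elif el.tag == 'c':
        previous_glue = el.text
        if last_word_id:
            words[last_word_id].glue += el.text

print(repr(words['pc1'].previous_glue), repr(words['pc1'].glue))  # ' ' ' ' -> space on both sides of the comma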
@@ -8,6 +8,7 @@ class RestrictionType(Enum):
     Morphology = 0
     Lexis = 1
     MatchAll = 2
+    Space = 3


 def determine_ppb(rgxs):
@@ -123,6 +124,32 @@ class LexisRegex:
     def __call__(self, text):
         return text in self.match_list


+class SpaceRegex:
+    def __init__(self, restriction):
+        restr_dict = {}
+        for feature in restriction:
+            restr_dict.update(feature.items())
+
+        assert "contact" in restr_dict
+        self.space = restr_dict['contact'].split('|')
+        for el in self.space:
+            if el not in ['both', 'right', 'left', 'neither']:
+                raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')
+
+    def __call__(self, word):
+        match = False
+        if 'both' in self.space:
+            match = match or (word.previous_glue != '' and word.glue != '')
+        if 'right' in self.space:
+            match = match or (word.previous_glue == '' and word.glue != '')
+        if 'left' in self.space:
+            match = match or (word.previous_glue != '' and word.glue == '')
+        if 'neither' in self.space:
+            match = match or (word.previous_glue == '' and word.glue == '')
+
+        return match
+
+
 class Restriction:
     def __init__(self, restriction_tag):
         self.ppb = 4  # polnopomenska beseda (0-4)
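The four checks in SpaceRegex.__call__ are mutually exclusive, so every word falls into exactly one case and the matcher returns True when that case is listed in the contact value. A small sketch (not part of the commit), using a stand-in token that only carries the two glue fields SpaceRegex reads:

# Stand-in token: SpaceRegex only reads previous_glue and glue.
from collections import namedtuple

Tok = namedtuple('Tok', ['previous_glue', 'glue'])

def space_kind(word):
    # the single case the checks in SpaceRegex.__call__ would accept for this word
    if word.previous_glue and word.glue:
        return 'both'      # whitespace on both sides
    if not word.previous_glue and word.glue:
        return 'right'     # whitespace only after the token
    if word.previous_glue and not word.glue:
        return 'left'      # whitespace only before the token
    return 'neither'       # no surrounding whitespace

assert space_kind(Tok(' ', ' ')) == 'both'
assert space_kind(Tok('', ' ')) == 'right'
assert space_kind(Tok(' ', '')) == 'left'
assert space_kind(Tok('', '')) == 'neither'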
@@ -142,6 +169,10 @@ class Restriction:
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
             self.matcher = LexisRegex(list(restriction_tag))

+        elif restriction_type == "space":
+            self.type = RestrictionType.Space
+            self.matcher = SpaceRegex(list(restriction_tag))
+
         else:
             raise NotImplementedError()
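For orientation, a hypothetical shape of a "space" restriction as SpaceRegex would parse it (not part of the commit). The feature element name is taken from the loop variable in SpaceRegex.__init__ and may differ in the real structure definitions; only the contact attribute and its allowed values are fixed by the code.

# Hypothetical "space" restriction, parsed the same way as SpaceRegex.__init__ above.
from xml.etree import ElementTree

restriction_tag = ElementTree.XML(
    '<restriction type="space"><feature contact="right|both"/></restriction>')

restr_dict = {}
for feature in restriction_tag:          # same loop as in SpaceRegex.__init__
    restr_dict.update(feature.items())
print(restr_dict['contact'].split('|'))  # ['right', 'both']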
@@ -152,6 +183,8 @@ class Restriction:
             match_to = word.lemma
         elif self.type == RestrictionType.MatchAll:
             return True
+        elif self.type == RestrictionType.Space:
+            match_to = word
         else:
             raise RuntimeError("Unreachable!")

luscenje_struktur/restriction_group.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+from luscenje_struktur.restriction import Restriction
+
+class RestrictionGroup:
+    def __init__(self, restrictions_tag, group_type='and'):
+        self.restrictions = [Restriction(el) for el in restrictions_tag]
+        self.group_type = group_type
+
+    def __iter__(self):
+        for restriction in self.restrictions:
+            yield restriction
+
+    def match(self, word):
+        if self.group_type == 'or':
+            for restr in self.restrictions:
+                if restr.match(word): # match either
+                    return True
+            return False
+        elif self.group_type == 'and':
+            for restr in self.restrictions:
+                if not restr.match(word): # match and
+                    return False
+            return True
+        else:
+            raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
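A short usage sketch (not part of the commit) of the 'and'/'or' semantics above. DummyRestriction stands in for Restriction, which normally needs an XML tag, and Group restates RestrictionGroup.match with any/all instead of explicit loops.

# Stubbed illustration of RestrictionGroup.match; DummyRestriction replaces Restriction.
class DummyRestriction:
    def __init__(self, result):
        self.result = result
    def match(self, word):
        return self.result

class Group:
    # behaviourally equivalent to RestrictionGroup.match, but over prebuilt restrictions
    def __init__(self, restrictions, group_type='and'):
        self.restrictions, self.group_type = restrictions, group_type
    def match(self, word):
        if self.group_type == 'or':
            return any(r.match(word) for r in self.restrictions)   # match either
        return all(r.match(word) for r in self.restrictions)       # match and

assert Group([DummyRestriction(True), DummyRestriction(False)], 'or').match(None) is True
assert Group([DummyRestriction(True), DummyRestriction(False)], 'and').match(None) is False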
@@ -34,13 +34,12 @@ class SyntacticStructure:

         for comp in definitions:
             n = comp.get('cid')
-            restrs[n] = None
+            restrs[n] = []
             forms[n] = []

             for el in comp:
                 if el.tag.startswith("restriction"):
-                    assert restrs[n] is None
-                    restrs[n] = el
+                    restrs[n].append(el)
                 elif el.tag.startswith("representation"):
                     st.add_representation(n, el, forms)
                 else:
@@ -32,13 +32,14 @@ class WordDummy:


 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
         self.idi = None
         self.text = text
         self.glue = ''
+        self.previous_glue = '' if previous_punctuation is None else previous_punctuation
         self.fake_word = fake_word

         self.links = defaultdict(list)
wani.py (3 lines changed)
@@ -153,7 +153,8 @@ if __name__ == '__main__':
                         help='Tag for separators, usually pc or c', default="pc")
     parser.add_argument('--separator',
                         help='Separator in output file', default="\t")
+    parser.add_argument('--ignore-punctuations',
+                        help='Ignore punctuations', action='store_true')
     args = parser.parse_args()

     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

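A minimal sketch (not part of the commit) of how the new flag travels from the command line into the loader: argparse exposes it as args.ignore_punctuations, which file_sentence_generator negates into use_punctuations.

# Sketch: argparse turns --ignore-punctuations into args.ignore_punctuations,
# which file_sentence_generator negates into use_punctuations.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ignore-punctuations', action='store_true',
                    help='Skip the space/punctuation bookkeeping while loading sentences')

args = parser.parse_args(['--ignore-punctuations'])
use_punctuations = not args.ignore_punctuations
print(use_punctuations)  # False here; True when the flag is omitted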