Ignore syntactic structures with @type=single and add the --new-tei option

This commit is contained in:
Luka 2021-01-13 16:36:44 +01:00
parent fa4479af60
commit 361331515e
3 changed files with 35 additions and 17 deletions

View File

@ -204,23 +204,36 @@ def file_sentence_generator(et, args):
words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
last_word_id = None last_word_id = None
for w in sentence.iter(): if args.new_tei:
if w.tag == 'w': for w in sentence.iter():
words[w.get('id')] = Word.from_xml(w, do_msd_translate) if w.tag == 'w':
if use_punctuations: words[w.get('id')] = Word.from_xml(w, do_msd_translate)
previous_glue = '' if use_punctuations:
last_word_id = None previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
elif w.tag == pc_tag: elif w.tag == pc_tag:
words[w.get('id')] = Word.pc_word(w, do_msd_translate) words[w.get('id')] = Word.pc_word(w, do_msd_translate)
if use_punctuations: if use_punctuations:
last_word_id = w.get('id') words[w.get('id')].previous_glue = previous_glue
words[w.get('id')].previous_glue = previous_glue words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
previous_glue = '' previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
elif use_punctuations and w.tag == 'c': else:
# always save previous glue for w in sentence.iter():
previous_glue = w.text if w.tag == 'w':
if last_word_id: words[w.get('id')] = Word.from_xml(w, do_msd_translate)
words[last_word_id].glue += w.text if use_punctuations:
previous_glue = ''
last_word_id = None
elif w.tag == pc_tag:
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
if use_punctuations:
last_word_id = w.get('id')
words[w.get('id')].previous_glue = previous_glue
previous_glue = ''
elif use_punctuations and w.tag == 'c':
# always save previous glue
previous_glue = w.text
if last_word_id:
words[last_word_id].glue += w.text
# for w in sentence.iter("w"): # for w in sentence.iter("w"):
# words[w.get('id')] = Word.from_xml(w, do_msd_translate) # words[w.get('id')] = Word.from_xml(w, do_msd_translate)

View File

@ -115,6 +115,8 @@ def build_structures(args):
structures = [] structures = []
for structure in et.iter('syntactic_structure'): for structure in et.iter('syntactic_structure'):
if structure.attrib['type'] == 'single':
continue
to_append = SyntacticStructure.from_xml(structure, no_stats) to_append = SyntacticStructure.from_xml(structure, no_stats)
if to_append is None: if to_append is None:
continue continue

View File

@ -160,6 +160,9 @@ if __name__ == '__main__':
parser.add_argument('--fixed-restriction-order', parser.add_argument('--fixed-restriction-order',
help='If used, words have to be in the same order as components.', help='If used, words have to be in the same order as components.',
action='store_true') action='store_true')
parser.add_argument('--new-tei',
help='Attribute to be used, when using new version of tei. (default=False)',
action='store_true')
args = parser.parse_args() args = parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())