Reset whitespace (glue) at paragraph boundaries instead of sentence boundaries; update the progress bar per paragraph instead of per sentence.
This commit is contained in:
parent
552f2e4bd0
commit
f1366548b6
|
@@ -197,76 +197,73 @@ def file_sentence_generator(et, args):
|
||||||
previous_pc = False
|
previous_pc = False
|
||||||
|
|
||||||
words = {}
|
words = {}
|
||||||
sentences = list(et.iter('s'))
|
paragraphs = list(et.iter('p'))
|
||||||
for sentence in progress(sentences, "load-text"):
|
for paragraph in progress(paragraphs, "load-text"):
|
||||||
# create fake root word
|
|
||||||
words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
|
|
||||||
last_word_id = None
|
|
||||||
previous_glue = ''
|
previous_glue = ''
|
||||||
|
sentences = list(paragraph.iter('s'))
|
||||||
|
for sentence in sentences:
|
||||||
|
# create fake root word
|
||||||
|
words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
|
||||||
|
last_word_id = None
|
||||||
|
|
||||||
if args.new_tei:
|
if args.new_tei:
|
||||||
for w in sentence.iter():
|
for w in sentence.iter():
|
||||||
if w.tag == 'w':
|
if w.tag == 'w':
|
||||||
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||||
if use_punctuations:
|
if use_punctuations:
|
||||||
previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
||||||
elif w.tag == pc_tag:
|
elif w.tag == pc_tag:
|
||||||
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
|
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
|
||||||
if use_punctuations:
|
if use_punctuations:
|
||||||
words[w.get('id')].previous_glue = previous_glue
|
words[w.get('id')].previous_glue = previous_glue
|
||||||
words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
||||||
previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
|
||||||
else:
|
|
||||||
for w in sentence.iter():
|
|
||||||
if w.tag == 'w':
|
|
||||||
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
|
||||||
if use_punctuations:
|
|
||||||
previous_glue = ''
|
|
||||||
last_word_id = None
|
|
||||||
elif w.tag == pc_tag:
|
|
||||||
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
|
|
||||||
if use_punctuations:
|
|
||||||
last_word_id = w.get('id')
|
|
||||||
words[w.get('id')].previous_glue = previous_glue
|
|
||||||
previous_glue = ''
|
|
||||||
elif use_punctuations and w.tag == 'c':
|
|
||||||
# always save previous glue
|
|
||||||
previous_glue = w.text
|
|
||||||
if last_word_id:
|
|
||||||
words[last_word_id].glue += w.text
|
|
||||||
|
|
||||||
# for w in sentence.iter("w"):
|
|
||||||
# words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
|
||||||
# for pc in sentence.iter(pc_tag):
|
|
||||||
# words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
|
|
||||||
|
|
||||||
for l in sentence.iter("link"):
|
|
||||||
if 'dep' in l.keys():
|
|
||||||
ana = l.get('afun')
|
|
||||||
lfrom = l.get('from')
|
|
||||||
dest = l.get('dep')
|
|
||||||
else:
|
else:
|
||||||
ana = l.get('ana')
|
for w in sentence.iter():
|
||||||
if ana[:8] != 'jos-syn:': # dont bother...
|
if w.tag == 'w':
|
||||||
continue
|
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||||
ana = ana[8:]
|
if use_punctuations:
|
||||||
lfrom, dest = l.get('target').replace('#', '').split()
|
previous_glue = ''
|
||||||
|
last_word_id = None
|
||||||
|
elif w.tag == pc_tag:
|
||||||
|
words[w.get('id')] = Word.pc_word(w, do_msd_translate)
|
||||||
|
if use_punctuations:
|
||||||
|
last_word_id = w.get('id')
|
||||||
|
words[w.get('id')].previous_glue = previous_glue
|
||||||
|
previous_glue = ''
|
||||||
|
elif use_punctuations and w.tag == 'c':
|
||||||
|
# always save previous glue
|
||||||
|
previous_glue = w.text
|
||||||
|
if last_word_id:
|
||||||
|
words[last_word_id].glue += w.text
|
||||||
|
|
||||||
if lfrom in words:
|
for l in sentence.iter("link"):
|
||||||
if not skip_id_check and is_root_id(lfrom):
|
if 'dep' in l.keys():
|
||||||
logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
|
ana = l.get('afun')
|
||||||
sys.exit(1)
|
lfrom = l.get('from')
|
||||||
|
dest = l.get('dep')
|
||||||
if dest in words:
|
|
||||||
next_word = words[dest]
|
|
||||||
words[lfrom].add_link(ana, next_word)
|
|
||||||
else:
|
else:
|
||||||
logging.error("Unknown id: {}".format(dest))
|
ana = l.get('ana')
|
||||||
sys.exit(1)
|
if ana[:8] != 'jos-syn:': # dont bother...
|
||||||
|
continue
|
||||||
|
ana = ana[8:]
|
||||||
|
lfrom, dest = l.get('target').replace('#', '').split()
|
||||||
|
|
||||||
else:
|
if lfrom in words:
|
||||||
# strange errors, just skip...
|
if not skip_id_check and is_root_id(lfrom):
|
||||||
pass
|
logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if dest in words:
|
||||||
|
next_word = words[dest]
|
||||||
|
words[lfrom].add_link(ana, next_word)
|
||||||
|
else:
|
||||||
|
logging.error("Unknown id: {}".format(dest))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# strange errors, just skip...
|
||||||
|
pass
|
||||||
|
|
||||||
return list(words.values())
|
return list(words.values())
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user