EMPTY COMMIT - removing too long lines
This commit is contained in:
parent
797060f619
commit
46e169095c
75
wani.py
75
wani.py
|
@ -279,7 +279,8 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
def match(self, word_msd):
|
def match(self, word_msd):
|
||||||
existing = [(w.msd, w.text) for w in self.words]
|
existing = [(w.msd, w.text) for w in self.words]
|
||||||
|
|
||||||
for candidate_msd, candidate_text in self.word_renderer.available_words(self.lemma, existing):
|
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
|
||||||
|
for candidate_msd, candidate_text in lemma_available_words:
|
||||||
if self.msd[0] != candidate_msd[0]:
|
if self.msd[0] != candidate_msd[0]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -299,7 +300,8 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
t1 = msd1[0]
|
t1 = msd1[0]
|
||||||
# if not in msd, some strange msd was tried, skipping...
|
# if not in msd, some strange msd was tried, skipping...
|
||||||
if agr_case not in TAGSET[t1]:
|
if agr_case not in TAGSET[t1]:
|
||||||
logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
|
logging.warning("Cannot do agreement: {} for msd {} not found!"
|
||||||
|
.format(agr_case, msd1))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
v1 = TAGSET[t1].index(agr_case)
|
v1 = TAGSET[t1].index(agr_case)
|
||||||
|
@ -312,7 +314,8 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
# REPEAT (not DRY!)
|
# REPEAT (not DRY!)
|
||||||
t2 = msd2[0]
|
t2 = msd2[0]
|
||||||
if agr_case not in TAGSET[t2]:
|
if agr_case not in TAGSET[t2]:
|
||||||
logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
|
logging.warning("Cannot do agreement: {} for msd {} not found!"
|
||||||
|
.format(agr_case, msd2))
|
||||||
return False
|
return False
|
||||||
v2 = TAGSET[t2].index(agr_case)
|
v2 = TAGSET[t2].index(agr_case)
|
||||||
if v2 + 1 >= len(msd2):
|
if v2 + 1 >= len(msd2):
|
||||||
|
@ -707,7 +710,8 @@ class SyntacticStructure:
|
||||||
assert(system.get('type') == 'JOS')
|
assert(system.get('type') == 'JOS')
|
||||||
components, dependencies, definitions = list(system)
|
components, dependencies, definitions = list(system)
|
||||||
|
|
||||||
deps = [ (dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')) for dep in dependencies ]
|
deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
|
||||||
|
for dep in dependencies]
|
||||||
comps = { comp.get('cid'): dict(comp.items()) for comp in components }
|
comps = { comp.get('cid'): dict(comp.items()) for comp in components }
|
||||||
|
|
||||||
restrs, forms = {}, {}
|
restrs, forms = {}, {}
|
||||||
|
@ -724,7 +728,8 @@ class SyntacticStructure:
|
||||||
elif el.tag.startswith("representation"):
|
elif el.tag.startswith("representation"):
|
||||||
st.add_representation(n, el, forms)
|
st.add_representation(n, el, forms)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id))
|
raise NotImplementedError("Unknown definition: {} in structure {}"
|
||||||
|
.format(el.tag, st.id))
|
||||||
|
|
||||||
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
||||||
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
||||||
|
@ -1023,7 +1028,8 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
||||||
class Writer:
|
class Writer:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_output_writer(args):
|
def make_output_writer(args):
|
||||||
return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed)
|
return Writer(False, args.output, args.multiple_output,
|
||||||
|
int(args.sort_by), args.sort_reversed)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_all_writer(args):
|
def make_all_writer(args):
|
||||||
|
@ -1202,7 +1208,8 @@ class ColocationIds:
|
||||||
components_dict = {structure.id: structure for structure in structures}
|
components_dict = {structure.id: structure for structure in structures}
|
||||||
idx = 1
|
idx = 1
|
||||||
for _1, sm in tqdm(self.data.items()):
|
for _1, sm in tqdm(self.data.items()):
|
||||||
ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
|
ComponentRendition.set_representations(
|
||||||
|
sm, components_dict[sm.structure_id], word_renderer)
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
def determine_colocation_dispersions(self):
|
def determine_colocation_dispersions(self):
|
||||||
|
@ -1257,7 +1264,8 @@ def main(input_file, structures_file, args):
|
||||||
del cmd[pidx]
|
del cmd[pidx]
|
||||||
|
|
||||||
def func(n):
|
def func(n):
|
||||||
cmdn = [sys.executable] + cmd + [args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
|
cmdn = [sys.executable] + cmd + [args.input[n],
|
||||||
|
"--match-to-file", "{}/{}.p".format(tmpdirname, n)]
|
||||||
subprocess.check_call(cmdn)
|
subprocess.check_call(cmdn)
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
@ -1296,24 +1304,43 @@ def main(input_file, structures_file, args):
|
||||||
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument('structures', help='Structures definitions in xml file')
|
description='Extract structures from a parsed corpus.')
|
||||||
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
parser.add_argument('structures',
|
||||||
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
help='Structures definitions in xml file')
|
||||||
parser.add_argument('--all', help='Additional output file, writes more data')
|
parser.add_argument('input',
|
||||||
|
help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
||||||
|
parser.add_argument('--output',
|
||||||
|
help='Output file (if none given, then output to stdout)')
|
||||||
|
parser.add_argument('--all',
|
||||||
|
help='Additional output file, writes more data')
|
||||||
|
|
||||||
parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
|
parser.add_argument('--no-msd-translate',
|
||||||
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
help='MSDs are translated from slovene to english by default',
|
||||||
parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1, nargs='?')
|
action='store_true')
|
||||||
parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
|
parser.add_argument('--skip-id-check',
|
||||||
parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
|
help='Skips checks for ids of <w> and <pc>, if they are in correct format',
|
||||||
parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
|
action='store_true')
|
||||||
|
parser.add_argument('--min_freq', help='Minimal frequency in output',
|
||||||
|
type=int, default=0, const=1, nargs='?')
|
||||||
|
parser.add_argument('--verbose', help='Enable verbose output to stderr',
|
||||||
|
choices=["warning", "info", "debug"], default="info",
|
||||||
|
const="info", nargs='?')
|
||||||
|
parser.add_argument('--count-files',
|
||||||
|
help="Count files: more verbose output", action='store_true')
|
||||||
|
parser.add_argument('--multiple-output',
|
||||||
|
help='Generate one output for each syntactic structure',
|
||||||
|
action='store_true')
|
||||||
|
|
||||||
parser.add_argument('--sort-by', help="Sort by a this column (index)", type=int, default=-1)
|
parser.add_argument('--sort-by',
|
||||||
parser.add_argument('--sort-reversed', help="Sort in reversed ored", action='store_true')
|
help="Sort by a this column (index)", type=int, default=-1)
|
||||||
|
parser.add_argument('--sort-reversed',
|
||||||
|
help="Sort in reversed ored", action='store_true')
|
||||||
|
|
||||||
parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")
|
parser.add_argument('--pc-tag',
|
||||||
parser.add_argument('--parallel', help='Run in multiple processes, should speed things up')
|
help='Tag for separators, usually pc or c', default="pc")
|
||||||
|
parser.add_argument('--parallel',
|
||||||
|
help='Run in multiple processes, should speed things up')
|
||||||
parser.add_argument('--match-to-file', help='Do not use!')
|
parser.add_argument('--match-to-file', help='Do not use!')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
@ -1322,5 +1349,3 @@ if __name__ == '__main__':
|
||||||
start = time.time()
|
start = time.time()
|
||||||
main(args.input, args.structures, args)
|
main(args.input, args.structures, args)
|
||||||
logging.info("TIME: {}".format(time.time() - start))
|
logging.info("TIME: {}".format(time.time() - start))
|
||||||
|
|
||||||
# 2876, 2945 type
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user