diff --git a/issue992/extract.py b/issue992/extract.py index 113d74c..8e159c8 100644 --- a/issue992/extract.py +++ b/issue992/extract.py @@ -1,37 +1,48 @@ +import argparse +import os import sys import tqdm good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"] -N1 = len(good_lemmas) -N2 = len(sys.argv) - 1 - -files_to_write = [open("polona/{}".format(l), 'w') for l in good_lemmas] - -for fidx, filename in enumerate(sys.argv[1:]): - with open(filename, 'r') as fp: - print("loading next...", end="", flush=True) - line = fp.readline() - lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell] - file_lines = fp.read().split("\n") - - for lidx, good_lemma in enumerate(good_lemmas): - spaces = " " * 20 if lidx == 0 else "" - print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True) - - for line in file_lines: - if good_lemma not in line: - continue - - line_split = line.split(',') - for lemma_idx in lemma_rows: - lemma = line_split[lemma_idx] - if lemma == good_lemma: - print(line, file=files_to_write[lidx]) - break - -for fp in files_to_write: - fp.close() - - +def main(args): + filepaths = [os.path.join(args.input, fn) for fn in os.listdir(args.input)] + filepaths = sorted(filepaths, key=lambda x: int(x.split('.')[-1])) + N1 = len(good_lemmas) + N2 = len(filepaths) - 1 + + files_to_write = [open("output/{}".format(l), 'w') for l in good_lemmas] + + for fidx, filename in enumerate(filepaths): + with open(filename, 'r') as fp: + print("loading next...", end="", flush=True) + line = fp.readline() + lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell] + file_lines = fp.read().split("\n") + + for lidx, good_lemma in enumerate(good_lemmas): + spaces = " " * 20 if lidx == 0 else "" + print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True) + + for line in file_lines: + if good_lemma not in line: + continue + + line_split = line.split(',') + for lemma_idx in lemma_rows: + lemma = line_split[lemma_idx] + if lemma == good_lemma: + print(line, file=files_to_write[lidx]) + break + + for fp in files_to_write: + fp.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Extract structures from a parsed corpus.') + parser.add_argument('input', + help='Path to folder with files') + args = parser.parse_args() + main(args) diff --git a/issue992/files b/issue992/files deleted file mode 100644 index f9a6d63..0000000 --- a/issue992/files +++ /dev/null @@ -1,81 +0,0 @@ -../data/gf2filesres/izhod.csv.100 -../data/gf2filesres/izhod.csv.101 -../data/gf2filesres/izhod.csv.102 -../data/gf2filesres/izhod.csv.103 -../data/gf2filesres/izhod.csv.104 -../data/gf2filesres/izhod.csv.105 -../data/gf2filesres/izhod.csv.106 -../data/gf2filesres/izhod.csv.107 -../data/gf2filesres/izhod.csv.108 -../data/gf2filesres/izhod.csv.12 -../data/gf2filesres/izhod.csv.13 -../data/gf2filesres/izhod.csv.14 -../data/gf2filesres/izhod.csv.15 -../data/gf2filesres/izhod.csv.16 -../data/gf2filesres/izhod.csv.17 -../data/gf2filesres/izhod.csv.18 -../data/gf2filesres/izhod.csv.19 -../data/gf2filesres/izhod.csv.22 -../data/gf2filesres/izhod.csv.23 -../data/gf2filesres/izhod.csv.24 -../data/gf2filesres/izhod.csv.25 -../data/gf2filesres/izhod.csv.26 -../data/gf2filesres/izhod.csv.27 -../data/gf2filesres/izhod.csv.28 -../data/gf2filesres/izhod.csv.29 -../data/gf2filesres/izhod.csv.30 -../data/gf2filesres/izhod.csv.31 -../data/gf2filesres/izhod.csv.32 -../data/gf2filesres/izhod.csv.34 -../data/gf2filesres/izhod.csv.35 -../data/gf2filesres/izhod.csv.36 -../data/gf2filesres/izhod.csv.37 -../data/gf2filesres/izhod.csv.38 -../data/gf2filesres/izhod.csv.39 -../data/gf2filesres/izhod.csv.40 -../data/gf2filesres/izhod.csv.41 -../data/gf2filesres/izhod.csv.42 -../data/gf2filesres/izhod.csv.43 -../data/gf2filesres/izhod.csv.44 -../data/gf2filesres/izhod.csv.45 -../data/gf2filesres/izhod.csv.46 -../data/gf2filesres/izhod.csv.47 -../data/gf2filesres/izhod.csv.48 -../data/gf2filesres/izhod.csv.49 -../data/gf2filesres/izhod.csv.50 -../data/gf2filesres/izhod.csv.51 -../data/gf2filesres/izhod.csv.52 -../data/gf2filesres/izhod.csv.53 -../data/gf2filesres/izhod.csv.54 -../data/gf2filesres/izhod.csv.55 -../data/gf2filesres/izhod.csv.57 -../data/gf2filesres/izhod.csv.68 -../data/gf2filesres/izhod.csv.69 -../data/gf2filesres/izhod.csv.70 -../data/gf2filesres/izhod.csv.71 -../data/gf2filesres/izhod.csv.72 -../data/gf2filesres/izhod.csv.73 -../data/gf2filesres/izhod.csv.74 -../data/gf2filesres/izhod.csv.75 -../data/gf2filesres/izhod.csv.76 -../data/gf2filesres/izhod.csv.77 -../data/gf2filesres/izhod.csv.78 -../data/gf2filesres/izhod.csv.80 -../data/gf2filesres/izhod.csv.81 -../data/gf2filesres/izhod.csv.82 -../data/gf2filesres/izhod.csv.83 -../data/gf2filesres/izhod.csv.84 -../data/gf2filesres/izhod.csv.85 -../data/gf2filesres/izhod.csv.86 -../data/gf2filesres/izhod.csv.87 -../data/gf2filesres/izhod.csv.88 -../data/gf2filesres/izhod.csv.89 -../data/gf2filesres/izhod.csv.90 -../data/gf2filesres/izhod.csv.91 -../data/gf2filesres/izhod.csv.92 -../data/gf2filesres/izhod.csv.93 -../data/gf2filesres/izhod.csv.94 -../data/gf2filesres/izhod.csv.95 -../data/gf2filesres/izhod.csv.96 -../data/gf2filesres/izhod.csv.97 -../data/gf2filesres/izhod.csv.98 diff --git a/luscenje_struktur/component.py b/luscenje_struktur/component.py index b5ce25a..a0a7668 100644 --- a/luscenje_struktur/component.py +++ b/luscenje_struktur/component.py @@ -21,7 +21,7 @@ class ComponentType(Enum): class Component: def __init__(self, info): idx = info['cid'] - name = info['name'] if 'name' in info else None + name = info['label'] if 'label' in info else None typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other if 'status' not in info: diff --git a/luscenje_struktur/representation_assigner.py b/luscenje_struktur/representation_assigner.py index 88eed7f..0fde91d 100644 --- a/luscenje_struktur/representation_assigner.py +++ b/luscenje_struktur/representation_assigner.py @@ -27,11 +27,10 @@ class RepresentationAssigner: elif feature['selection'] == "all": self.representation_factory = WordFormAllCR elif feature['selection'] == 'agreement': - assert feature['head'][:4] == 'cid_' assert feature['msd'] is not None self.representation_factory = WordFormAgreementCR self.more['agreement'] = feature['msd'].split('+') - self.more['other'] = feature['head'][4:] + self.more['other'] = feature['head_cid'] else: raise NotImplementedError("Representation selection: {}".format(feature)) diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py index 8734d43..3f06158 100644 --- a/luscenje_struktur/syntactic_structure.py +++ b/luscenje_struktur/syntactic_structure.py @@ -8,15 +8,17 @@ from luscenje_struktur.lemma_features import get_lemma_features class SyntacticStructure: def __init__(self): self.id = None - self.lbs = None + # self.lbs = None self.components = [] self.fake_root_included = False @staticmethod def from_xml(xml, no_stats): st = SyntacticStructure() - st.id = xml.get('id_nsss') - st.lbs = xml.get('LBS') + st.id = xml.get('id') + if st.id is None: + st.id = xml.get('tempId') + # st.lbs = xml.get('LBS') assert len(list(xml)) == 1 system = next(iter(xml))