Scripts adapted to changes of new structures.xml format
This commit is contained in:
parent
09c4277ebe
commit
8c87d07b8a
|
@ -1,37 +1,48 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
|
|
||||||
good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]
|
good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]
|
||||||
|
|
||||||
N1 = len(good_lemmas)
|
def main(args):
|
||||||
N2 = len(sys.argv) - 1
|
filepaths = [os.path.join(args.input, fn) for fn in os.listdir(args.input)]
|
||||||
|
filepaths = sorted(filepaths, key=lambda x: int(x.split('.')[-1]))
|
||||||
|
N1 = len(good_lemmas)
|
||||||
|
N2 = len(filepaths) - 1
|
||||||
|
|
||||||
files_to_write = [open("polona/{}".format(l), 'w') for l in good_lemmas]
|
files_to_write = [open("output/{}".format(l), 'w') for l in good_lemmas]
|
||||||
|
|
||||||
for fidx, filename in enumerate(sys.argv[1:]):
|
for fidx, filename in enumerate(filepaths):
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
print("loading next...", end="", flush=True)
|
print("loading next...", end="", flush=True)
|
||||||
line = fp.readline()
|
line = fp.readline()
|
||||||
lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell]
|
lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell]
|
||||||
file_lines = fp.read().split("\n")
|
file_lines = fp.read().split("\n")
|
||||||
|
|
||||||
for lidx, good_lemma in enumerate(good_lemmas):
|
for lidx, good_lemma in enumerate(good_lemmas):
|
||||||
spaces = " " * 20 if lidx == 0 else ""
|
spaces = " " * 20 if lidx == 0 else ""
|
||||||
print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True)
|
print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True)
|
||||||
|
|
||||||
for line in file_lines:
|
for line in file_lines:
|
||||||
if good_lemma not in line:
|
if good_lemma not in line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
line_split = line.split(',')
|
line_split = line.split(',')
|
||||||
for lemma_idx in lemma_rows:
|
for lemma_idx in lemma_rows:
|
||||||
lemma = line_split[lemma_idx]
|
lemma = line_split[lemma_idx]
|
||||||
if lemma == good_lemma:
|
if lemma == good_lemma:
|
||||||
print(line, file=files_to_write[lidx])
|
print(line, file=files_to_write[lidx])
|
||||||
break
|
break
|
||||||
|
|
||||||
for fp in files_to_write:
|
|
||||||
fp.close()
|
|
||||||
|
|
||||||
|
for fp in files_to_write:
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description='Extract structures from a parsed corpus.')
|
||||||
|
parser.add_argument('input',
|
||||||
|
help='Path to folder with files')
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
|
|
||||||
|
|
|
@ -1,81 +0,0 @@
|
||||||
../data/gf2filesres/izhod.csv.100
|
|
||||||
../data/gf2filesres/izhod.csv.101
|
|
||||||
../data/gf2filesres/izhod.csv.102
|
|
||||||
../data/gf2filesres/izhod.csv.103
|
|
||||||
../data/gf2filesres/izhod.csv.104
|
|
||||||
../data/gf2filesres/izhod.csv.105
|
|
||||||
../data/gf2filesres/izhod.csv.106
|
|
||||||
../data/gf2filesres/izhod.csv.107
|
|
||||||
../data/gf2filesres/izhod.csv.108
|
|
||||||
../data/gf2filesres/izhod.csv.12
|
|
||||||
../data/gf2filesres/izhod.csv.13
|
|
||||||
../data/gf2filesres/izhod.csv.14
|
|
||||||
../data/gf2filesres/izhod.csv.15
|
|
||||||
../data/gf2filesres/izhod.csv.16
|
|
||||||
../data/gf2filesres/izhod.csv.17
|
|
||||||
../data/gf2filesres/izhod.csv.18
|
|
||||||
../data/gf2filesres/izhod.csv.19
|
|
||||||
../data/gf2filesres/izhod.csv.22
|
|
||||||
../data/gf2filesres/izhod.csv.23
|
|
||||||
../data/gf2filesres/izhod.csv.24
|
|
||||||
../data/gf2filesres/izhod.csv.25
|
|
||||||
../data/gf2filesres/izhod.csv.26
|
|
||||||
../data/gf2filesres/izhod.csv.27
|
|
||||||
../data/gf2filesres/izhod.csv.28
|
|
||||||
../data/gf2filesres/izhod.csv.29
|
|
||||||
../data/gf2filesres/izhod.csv.30
|
|
||||||
../data/gf2filesres/izhod.csv.31
|
|
||||||
../data/gf2filesres/izhod.csv.32
|
|
||||||
../data/gf2filesres/izhod.csv.34
|
|
||||||
../data/gf2filesres/izhod.csv.35
|
|
||||||
../data/gf2filesres/izhod.csv.36
|
|
||||||
../data/gf2filesres/izhod.csv.37
|
|
||||||
../data/gf2filesres/izhod.csv.38
|
|
||||||
../data/gf2filesres/izhod.csv.39
|
|
||||||
../data/gf2filesres/izhod.csv.40
|
|
||||||
../data/gf2filesres/izhod.csv.41
|
|
||||||
../data/gf2filesres/izhod.csv.42
|
|
||||||
../data/gf2filesres/izhod.csv.43
|
|
||||||
../data/gf2filesres/izhod.csv.44
|
|
||||||
../data/gf2filesres/izhod.csv.45
|
|
||||||
../data/gf2filesres/izhod.csv.46
|
|
||||||
../data/gf2filesres/izhod.csv.47
|
|
||||||
../data/gf2filesres/izhod.csv.48
|
|
||||||
../data/gf2filesres/izhod.csv.49
|
|
||||||
../data/gf2filesres/izhod.csv.50
|
|
||||||
../data/gf2filesres/izhod.csv.51
|
|
||||||
../data/gf2filesres/izhod.csv.52
|
|
||||||
../data/gf2filesres/izhod.csv.53
|
|
||||||
../data/gf2filesres/izhod.csv.54
|
|
||||||
../data/gf2filesres/izhod.csv.55
|
|
||||||
../data/gf2filesres/izhod.csv.57
|
|
||||||
../data/gf2filesres/izhod.csv.68
|
|
||||||
../data/gf2filesres/izhod.csv.69
|
|
||||||
../data/gf2filesres/izhod.csv.70
|
|
||||||
../data/gf2filesres/izhod.csv.71
|
|
||||||
../data/gf2filesres/izhod.csv.72
|
|
||||||
../data/gf2filesres/izhod.csv.73
|
|
||||||
../data/gf2filesres/izhod.csv.74
|
|
||||||
../data/gf2filesres/izhod.csv.75
|
|
||||||
../data/gf2filesres/izhod.csv.76
|
|
||||||
../data/gf2filesres/izhod.csv.77
|
|
||||||
../data/gf2filesres/izhod.csv.78
|
|
||||||
../data/gf2filesres/izhod.csv.80
|
|
||||||
../data/gf2filesres/izhod.csv.81
|
|
||||||
../data/gf2filesres/izhod.csv.82
|
|
||||||
../data/gf2filesres/izhod.csv.83
|
|
||||||
../data/gf2filesres/izhod.csv.84
|
|
||||||
../data/gf2filesres/izhod.csv.85
|
|
||||||
../data/gf2filesres/izhod.csv.86
|
|
||||||
../data/gf2filesres/izhod.csv.87
|
|
||||||
../data/gf2filesres/izhod.csv.88
|
|
||||||
../data/gf2filesres/izhod.csv.89
|
|
||||||
../data/gf2filesres/izhod.csv.90
|
|
||||||
../data/gf2filesres/izhod.csv.91
|
|
||||||
../data/gf2filesres/izhod.csv.92
|
|
||||||
../data/gf2filesres/izhod.csv.93
|
|
||||||
../data/gf2filesres/izhod.csv.94
|
|
||||||
../data/gf2filesres/izhod.csv.95
|
|
||||||
../data/gf2filesres/izhod.csv.96
|
|
||||||
../data/gf2filesres/izhod.csv.97
|
|
||||||
../data/gf2filesres/izhod.csv.98
|
|
|
@ -21,7 +21,7 @@ class ComponentType(Enum):
|
||||||
class Component:
|
class Component:
|
||||||
def __init__(self, info):
|
def __init__(self, info):
|
||||||
idx = info['cid']
|
idx = info['cid']
|
||||||
name = info['name'] if 'name' in info else None
|
name = info['label'] if 'label' in info else None
|
||||||
typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other
|
typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other
|
||||||
|
|
||||||
if 'status' not in info:
|
if 'status' not in info:
|
||||||
|
|
|
@ -27,11 +27,10 @@ class RepresentationAssigner:
|
||||||
elif feature['selection'] == "all":
|
elif feature['selection'] == "all":
|
||||||
self.representation_factory = WordFormAllCR
|
self.representation_factory = WordFormAllCR
|
||||||
elif feature['selection'] == 'agreement':
|
elif feature['selection'] == 'agreement':
|
||||||
assert feature['head'][:4] == 'cid_'
|
|
||||||
assert feature['msd'] is not None
|
assert feature['msd'] is not None
|
||||||
self.representation_factory = WordFormAgreementCR
|
self.representation_factory = WordFormAgreementCR
|
||||||
self.more['agreement'] = feature['msd'].split('+')
|
self.more['agreement'] = feature['msd'].split('+')
|
||||||
self.more['other'] = feature['head'][4:]
|
self.more['other'] = feature['head_cid']
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Representation selection: {}".format(feature))
|
raise NotImplementedError("Representation selection: {}".format(feature))
|
||||||
|
|
||||||
|
|
|
@ -8,15 +8,17 @@ from luscenje_struktur.lemma_features import get_lemma_features
|
||||||
class SyntacticStructure:
|
class SyntacticStructure:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.id = None
|
self.id = None
|
||||||
self.lbs = None
|
# self.lbs = None
|
||||||
self.components = []
|
self.components = []
|
||||||
self.fake_root_included = False
|
self.fake_root_included = False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_xml(xml, no_stats):
|
def from_xml(xml, no_stats):
|
||||||
st = SyntacticStructure()
|
st = SyntacticStructure()
|
||||||
st.id = xml.get('id_nsss')
|
st.id = xml.get('id')
|
||||||
st.lbs = xml.get('LBS')
|
if st.id is None:
|
||||||
|
st.id = xml.get('tempId')
|
||||||
|
# st.lbs = xml.get('LBS')
|
||||||
|
|
||||||
assert len(list(xml)) == 1
|
assert len(list(xml)) == 1
|
||||||
system = next(iter(xml))
|
system = next(iter(xml))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user