Fixed downloading and parsing of structures

Finish tomorrow?
This commit is contained in:
Ozbolt Menegatti 2020-07-01 23:16:02 +02:00
parent 887eb37d20
commit f1f540e5da
4 changed files with 23 additions and 19 deletions

View File

@ -9,11 +9,12 @@ if [[ -z "${API_KEY}" ]]; then
fi
OUT_FILE=conversions.csv
TEMP_FILE=$(mktemp)
rm -f $OUT_FILE
rm -r $OUT_FILE
for fname in glagol pridevnik prislov samostalnik; do
curl -s "https://gitea.cjvt.si/api/v1/repos/redmine_projects/kolokacije/contents/resources/structure_conversions/$fname.csv?token=$API_KEY" |
python3 -c "import sys, json; print(json.load(sys.stdin)['content'])" | base64 -d >> $OUT_FILE
done
curl -s "https://gitea.cjvt.si/api/v1/repos/generic/data_admin/contents/resources/structure_conversions.csv?token=$API_KEY" -o $TEMP_FILE
echo "wc: $(wc $TEMP_FILE)"
cat $TEMP_FILE | python3 -c "import sys, json; print(json.load(sys.stdin)['content'])" | base64 -d > $OUT_FILE
rm -rf $TEMP_FILE

View File

@ -1,3 +1,5 @@
from browser import window
__pragma__ ('noanno')
__pragma__ ('js', """
var fs = require('fs');
@ -13,7 +15,7 @@ def build_structure_conversions():
global structure_conversions
structure_conversions = []
structure_conversions_raw = [line.split(",") for line in conversion_csv.split("\n")]
structure_conversions_raw = [line.split("|") for line in conversion_csv.split("\n")]
for line in structure_conversions_raw:
if min(len(line[0]), len(line[1])) == 0:
continue
@ -22,28 +24,29 @@ def build_structure_conversions():
if line[1] == "struktura":
continue
vfrom = "^" + line[0].replace("?", "\?").replace("%s", "([a-zA-Z螚ȎŠ-]+)") + "$"
vto = line[1].replace("<struktura>", "").replace("</struktura>", "").replace("%s", "$1").strip()
vto_name = line[2].strip()
vto_id = line[4].strip()
structure_conversions.append((__new__(RegExp(vfrom, 'u')),
vto))
if 0 in (len(vto_name), len(vto_id)):
continue
vfrom = "^" + line[0].replace("?", "\?").replace("%s", "([a-zA-Z螚ȎŠ-]+)") + "$"
structure_conversions.append((__new__(RegExp(vfrom, 'u')), vto_name, vto_id))
def convert_structure(structure):
if structure_conversions is None:
build_structure_conversions()
for vfrom, vto in structure_conversions:
for vfrom, vto_name, vto_id in structure_conversions:
match = structure.match(vfrom)
if match:
# we need to remove the replace alias here, as we want to use JavaScript's replace
__pragma__('noalias', 'replace')
result = structure.replace(vfrom, vto).strip()
result = structure.replace(vfrom, vto_name).strip()
__pragma__('alias', 'replace', "py_replace")
# they said this also needs to be done - remove "-d$" from %s match
if len(match) > 1 and match[1].endswith("-d"):
result = result.replace(match[1], match[1][:-2])
return result
return result, vto_id
window.console.log("Unknown structure: ", structure)
return None

View File

@ -58,7 +58,7 @@ class SkeCollocation:
def __init__(self, data):
self.word = data.word
self.frequency = data.count
self.structure_name = convert_structure(data.gramrel)
self.structure_name, self.structure_id = convert_structure(data.gramrel)
self.other = {"score": data.score, "cm": data.cm}
@ -242,7 +242,7 @@ class SkeInsert(DataChgClickMessage):
new_collocation = Example()
new_collocation.inner = MultiwordExample()
new_collocation.inner.other_attributes["structureName"] = example.structure_name
new_collocation.inner.other_attributes["structure_id"] = example.structure_id
new_collocation.inner.other_attributes["logDice"] = example.other["score"]
new_collocation.inner.other_attributes["frequency"] = example.frequency
new_collocation.inner.type = "collocation"

View File

@ -31,7 +31,7 @@ class Example(Data):
example.edited = True
example.inner = MultiwordExample()
example.inner.cluster = ExampleClusters.first_empty_cluster()
example.inner.type = "type??"
example.inner.type = "grammaticalCombination"
empty_component = ComponentLexeme()
empty_component.role = "headword"