forked from kristjan/cjvt-srl-tagging
msdmap.py
This commit is contained in:
parent
d1ba56be37
commit
3bf2fa609f
180
data_format.xml
Normal file
180
data_format.xml
Normal file
|
@ -0,0 +1,180 @@
|
|||
# sl.test.mate
|
||||
Task: parse Kres data into an input format for the tagger.
|
||||
Conll 2009 format: https://wiki.ufal.ms.mff.cuni.cz/format-conll
|
||||
|
||||
id form lemma plemma pos ppos feat pfeat head phead deprel pdeprel fillpred pred apred1 apred2 apred3 ...
|
||||
|
||||
|
||||
1 Prispelo prispeti prispeti V V Verb|main|perfective|participle|singular|neuter Verb|main|perfective|participle|singular|neuter 0 0 modra modra Y prispeti _ _ _ _ _ _ _ _ _
|
||||
2 je biti biti V V Verb|auxiliary|present|third|singular|-Negative Verb|auxiliary|present|third|singular|-Negative 1 1 del del _ _ _ _ _ _ _ _ _ _ _
|
||||
3 skoraj skoraj skoraj L L Particle Particle 4 4 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
4 60 60 60 K K Numeral|digit|cardinal Numeral|digit|cardinal 5 5 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
5 izdelkov izdelek izdelek S S Noun|common|masculine|plural|genitive Noun|common|masculine|plural|genitive 1 1 ena ena _ _ ACT _ _ _ _ _ _ _ _
|
||||
6 osnovnošolcev osnovnošolec osnovnošolec S S Noun|common|masculine|plural|genitive Noun|common|masculine|plural|genitive 5 5 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
7 iz iz iz D D Adposition|genitive Adposition|genitive 9 9 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
8 12 12 12 K K Numeral|digit|cardinal Numeral|digit|cardinal 9 9 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
9 šol šola šola S S Noun|common|feminine|plural|genitive Noun|common|feminine|plural|genitive 6 6 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
10 širšega širok širok P P Adjective|general|comparative|neuter|singular|genitive Adjective|general|comparative|neuter|singular|genitive 12 12 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
11 ptujskega ptujski ptujski P P Adjective|general|positive|neuter|singular|genitive Adjective|general|positive|neuter|singular|genitive 12 12 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
12 območja območje območje S S Noun|common|neuter|singular|genitive Noun|common|neuter|singular|genitive 9 9 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
13 . . . . . . . 0 0 modra modra _ _ _ _ _ _ _ _ _ _ _
|
||||
|
||||
<s xml:id="ssj187.1237.4493">
|
||||
<w ana="msd:Ggdd-es" lemma="prispeti" xml:id="ssj187.1237.4493.t1">Prispelo</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Gp-ste-n" lemma="biti" xml:id="ssj187.1237.4493.t2">je</w>
|
||||
<c> </c>
|
||||
<w ana="msd:L" lemma="skoraj" xml:id="ssj187.1237.4493.t3">skoraj</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Kag" lemma="60" xml:id="ssj187.1237.4493.t4">60</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sommr" lemma="izdelek" xml:id="ssj187.1237.4493.t5">izdelkov</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sommr" lemma="osnovnošolec" xml:id="ssj187.1237.4493.t6">osnovnošolcev</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Dr" lemma="iz" xml:id="ssj187.1237.4493.t7">iz</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Kag" lemma="12" xml:id="ssj187.1237.4493.t8">12</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sozmr" lemma="šola" xml:id="ssj187.1237.4493.t9">šol</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Pppser" lemma="širok" xml:id="ssj187.1237.4493.t10">širšega</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ppnser" lemma="ptujski" xml:id="ssj187.1237.4493.t11">ptujskega</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Soser" lemma="območje" xml:id="ssj187.1237.4493.t12">območja</w>
|
||||
<pc ana="msd:U" xml:id="ssj187.1237.4493.t13">.</pc>
|
||||
<linkGrp corresp="#ssj187.1237.4493" targFunc="head argument" type="syntax">
|
||||
<link ana="syn:modra" target="#ssj187.1237.4493 #ssj187.1237.4493.t1"/>
|
||||
<link ana="syn:del" target="#ssj187.1237.4493.t1 #ssj187.1237.4493.t2"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t4 #ssj187.1237.4493.t3"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t5 #ssj187.1237.4493.t4"/>
|
||||
<link ana="syn:ena" target="#ssj187.1237.4493.t1 #ssj187.1237.4493.t5"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t5 #ssj187.1237.4493.t6"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t9 #ssj187.1237.4493.t7"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t9 #ssj187.1237.4493.t8"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t6 #ssj187.1237.4493.t9"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t12 #ssj187.1237.4493.t10"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t12 #ssj187.1237.4493.t11"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4493.t9 #ssj187.1237.4493.t12"/>
|
||||
<link ana="syn:modra" target="#ssj187.1237.4493 #ssj187.1237.4493.t13"/>
|
||||
</linkGrp>
|
||||
<linkGrp corresp="#ssj187.1237.4493" targFunc="head argument" type="SRL">
|
||||
<link ana="srl:ACT" target="#ssj187.1237.4493.t1 #ssj187.1237.4493.t5"/>
|
||||
</linkGrp>
|
||||
</s>
|
||||
|
||||
|
||||
|
||||
1 Izbrana izbran izbran P P Adjective|participle|positive|neuter|plural|accusative Adjective|participle|positive|neuter|plural|accusative 2 2 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
2 dela delo delo S S Noun|common|neuter|plural|accusative Noun|common|neuter|plural|accusative 4 4 dve dve _ _ PAT _ _ _ _ _ _ _ _
|
||||
3 so biti biti V V Verb|auxiliary|present|third|plural|-Negative Verb|auxiliary|present|third|plural|-Negative 4 4 del del _ _ _ _ _ _ _ _ _ _ _
|
||||
4 razstavili razstaviti razstaviti V V Verb|main|perfective|participle|plural|masculine Verb|main|perfective|participle|plural|masculine 0 0 modra modra Y razstaviti _ _ _ _ _ _ _ _ _
|
||||
5 v v v D D Adposition|locative Adposition|locative 7 7 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
6 vhodni vhoden vhoden P P Adjective|general|positive|feminine|singular|locative Adjective|general|positive|feminine|singular|locative 7 7 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
7 avli avla avla S S Noun|common|feminine|singular|locative Noun|common|feminine|singular|locative 4 4 štiri štiri _ _ LOC _ _ _ _ _ _ _ _
|
||||
8 upravne upraven upraven P P Adjective|general|positive|feminine|singular|genitive Adjective|general|positive|feminine|singular|genitive 9 9 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
9 stavbe stavba stavba S S Noun|common|feminine|singular|genitive Noun|common|feminine|singular|genitive 7 7 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
10 PP PP PP S S Noun|proper|feminine|singular|genitive Noun|proper|feminine|singular|genitive 9 9 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
11 in in in V V Conjunction|coordinating Conjunction|coordinating 17 17 vez vez _ _ _ _ _ _ _ _ _ _ _
|
||||
12 tja tja tja R R Adverb|general|positive Adverb|general|positive 17 17 štiri štiri _ _ _ GOAL _ _ _ _ _ _ _
|
||||
13 konec konec konec D D Adposition|genitive Adposition|genitive 14 14 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
14 novembra november november S S Noun|common|masculine|singular|genitive Noun|common|masculine|singular|genitive 17 17 štiri štiri _ _ _ TIME _ _ _ _ _ _ _
|
||||
15 na na na D D Adposition|accusative Adposition|accusative 16 16 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
16 otvoritev otvoritev otvoritev S S Noun|common|feminine|singular|accusative Noun|common|feminine|singular|accusative 17 17 štiri štiri _ _ _ AIM _ _ _ _ _ _ _
|
||||
17 povabili povabiti povabiti V V Verb|main|perfective|participle|plural|masculine Verb|main|perfective|participle|plural|masculine 0 0 modra modra Y povabiti _ _ _ _ _ _ _ _ _
|
||||
18 male mali mali P P Adjective|general|positive|masculine|plural|accusative Adjective|general|positive|masculine|plural|accusative 19 19 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
19 risarje risar risar S S Noun|common|masculine|plural|accusative Noun|common|masculine|plural|accusative 17 17 dve dve _ _ _ PAT _ _ _ _ _ _ _
|
||||
20 , , , , , , , 22 22 vez vez _ _ _ _ _ _ _ _ _ _ _
|
||||
21 njihove njihov njihov Z Z Pronoun|possessive|third|masculine|plural|accusative|plural Pronoun|possessive|third|masculine|plural|accusative|plural 22 22 dol dol _ _ _ _ _ _ _ _ _ _ _
|
||||
22 starše starš starš S S Noun|common|masculine|plural|accusative Noun|common|masculine|plural|accusative 19 19 prir prir _ _ _ _ _ _ _ _ _ _ _
|
||||
23 in in in V V Conjunction|coordinating Conjunction|coordinating 24 24 vez vez _ _ _ _ _ _ _ _ _ _ _
|
||||
24 učitelje učitelj učitelj S S Noun|common|masculine|plural|accusative Noun|common|masculine|plural|accusative 19 19 prir prir _ _ _ _ _ _ _ _ _ _ _
|
||||
25 . . . . . . . 0 0 modra modra _ _ _ _ _ _ _ _ _ _ _
|
||||
|
||||
|
||||
<s xml:id="ssj187.1237.4495">
|
||||
<w ana="msd:Pdnsmt" lemma="izbran" xml:id="ssj187.1237.4495.t1">Izbrana</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sosmt" lemma="delo" xml:id="ssj187.1237.4495.t2">dela</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Gp-stm-n" lemma="biti" xml:id="ssj187.1237.4495.t3">so</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ggdd-mm" lemma="razstaviti" xml:id="ssj187.1237.4495.t4">razstavili</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Dm" lemma="v" xml:id="ssj187.1237.4495.t5">v</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ppnzem" lemma="vhoden" xml:id="ssj187.1237.4495.t6">vhodni</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sozem" lemma="avla" xml:id="ssj187.1237.4495.t7">avli</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ppnzer" lemma="upraven" xml:id="ssj187.1237.4495.t8">upravne</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sozer" lemma="stavba" xml:id="ssj187.1237.4495.t9">stavbe</w>
|
||||
<c> </c>
|
||||
<seg subtype="org" type="name">
|
||||
<w ana="msd:Slzer" lemma="PP" xml:id="ssj187.1237.4495.t10">PP</w>
|
||||
</seg>
|
||||
<c> </c>
|
||||
<w ana="msd:Vp" lemma="in" xml:id="ssj187.1237.4495.t11">in</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Rsn" lemma="tja" xml:id="ssj187.1237.4495.t12">tja</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Dr" lemma="konec" xml:id="ssj187.1237.4495.t13">konec</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Somer" lemma="november" xml:id="ssj187.1237.4495.t14">novembra</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Dt" lemma="na" xml:id="ssj187.1237.4495.t15">na</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sozet" lemma="otvoritev" xml:id="ssj187.1237.4495.t16">otvoritev</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ggdd-mm" lemma="povabiti" xml:id="ssj187.1237.4495.t17">povabili</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Ppnmmt" lemma="mali" xml:id="ssj187.1237.4495.t18">male</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sommt" lemma="risar" xml:id="ssj187.1237.4495.t19">risarje</w>
|
||||
<pc ana="msd:U" xml:id="ssj187.1237.4495.t20">,</pc>
|
||||
<c> </c>
|
||||
<w ana="msd:Zstmmtm" lemma="njihov" xml:id="ssj187.1237.4495.t21">njihove</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sommt" lemma="starš" xml:id="ssj187.1237.4495.t22">starše</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Vp" lemma="in" xml:id="ssj187.1237.4495.t23">in</w>
|
||||
<c> </c>
|
||||
<w ana="msd:Sommt" lemma="učitelj" xml:id="ssj187.1237.4495.t24">učitelje</w>
|
||||
<pc ana="msd:U" xml:id="ssj187.1237.4495.t25">.</pc>
|
||||
<linkGrp corresp="#ssj187.1237.4495" targFunc="head argument" type="syntax">
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t2 #ssj187.1237.4495.t1"/>
|
||||
<link ana="syn:dve" target="#ssj187.1237.4495.t4 #ssj187.1237.4495.t2"/>
|
||||
<link ana="syn:del" target="#ssj187.1237.4495.t4 #ssj187.1237.4495.t3"/>
|
||||
<link ana="syn:modra" target="#ssj187.1237.4495 #ssj187.1237.4495.t4"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t7 #ssj187.1237.4495.t5"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t7 #ssj187.1237.4495.t6"/>
|
||||
<link ana="syn:štiri" target="#ssj187.1237.4495.t4 #ssj187.1237.4495.t7"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t9 #ssj187.1237.4495.t8"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t7 #ssj187.1237.4495.t9"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t9 #ssj187.1237.4495.t10"/>
|
||||
<link ana="syn:vez" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t11"/>
|
||||
<link ana="syn:štiri" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t12"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t14 #ssj187.1237.4495.t13"/>
|
||||
<link ana="syn:štiri" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t14"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t16 #ssj187.1237.4495.t15"/>
|
||||
<link ana="syn:štiri" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t16"/>
|
||||
<link ana="syn:modra" target="#ssj187.1237.4495 #ssj187.1237.4495.t17"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t19 #ssj187.1237.4495.t18"/>
|
||||
<link ana="syn:dve" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t19"/>
|
||||
<link ana="syn:vez" target="#ssj187.1237.4495.t22 #ssj187.1237.4495.t20"/>
|
||||
<link ana="syn:dol" target="#ssj187.1237.4495.t22 #ssj187.1237.4495.t21"/>
|
||||
<link ana="syn:prir" target="#ssj187.1237.4495.t19 #ssj187.1237.4495.t22"/>
|
||||
<link ana="syn:vez" target="#ssj187.1237.4495.t24 #ssj187.1237.4495.t23"/>
|
||||
<link ana="syn:prir" target="#ssj187.1237.4495.t19 #ssj187.1237.4495.t24"/>
|
||||
<link ana="syn:modra" target="#ssj187.1237.4495 #ssj187.1237.4495.t25"/>
|
||||
</linkGrp>
|
||||
<linkGrp corresp="#ssj187.1237.4495" targFunc="head argument" type="SRL">
|
||||
<link ana="srl:AIM" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t16"/>
|
||||
<link ana="srl:GOAL" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t12"/>
|
||||
<link ana="srl:LOC" target="#ssj187.1237.4495.t4 #ssj187.1237.4495.t7"/>
|
||||
<link ana="srl:PAT" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t19"/>
|
||||
<link ana="srl:PAT" target="#ssj187.1237.4495.t4 #ssj187.1237.4495.t2"/>
|
||||
<link ana="srl:TIME" target="#ssj187.1237.4495.t17 #ssj187.1237.4495.t14"/>
|
||||
</linkGrp>
|
311
tools/parser/msdmap.py
Normal file
311
tools/parser/msdmap.py
Normal file
|
@ -0,0 +1,311 @@
|
|||
# MSD (morphosyntactic description) mappings from Slovene ("slo") to English ("ang")
|
||||
class Msdmap():
    """Translate Slovene MULTEXT-East/JOS MSD tags into their English form.

    An MSD is a positional tag: the first character is the part-of-speech
    code and every following character is the value code of the attribute
    that belongs to that position ("-" means "attribute not applicable").
    The same value code is reused by different attributes of one category
    (e.g. for nouns "m" is masculine, plural or locative depending on the
    position), so a value can only be resolved together with its attribute.

    Attributes:
        pos_slo_ang: list of (Slovene name, Slovene code, English name,
            English code) tuples, one per part-of-speech category.
        pos_val: list of 8-tuples (Slovene value, Slovene code, Slovene
            attribute, Slovene category, English value, English code,
            English attribute, English category).
    """

    # Slovene attribute names in MSD positional order, keyed by the English
    # category name used in column 7 of ``pos_val``.  Categories that are not
    # listed (Particle, Interjection, Abbreviation) carry no attributes.
    _ATTR_ORDER = {
        "Noun": ("vrsta", "spol", "število", "sklon", "živost"),
        "Verb": ("vrsta", "vid", "oblika", "oseba", "število", "spol",
                 "nikalnost"),
        "Adjective": ("vrsta", "stopnja", "spol", "število", "sklon",
                      "določnost"),
        "Adverb": ("vrsta", "stopnja"),
        "Pronoun": ("vrsta", "oseba", "spol", "število", "sklon",
                    "število_svojine", "spol_svojine", "naslonskost"),
        "Numeral": ("zapis", "vrsta", "spol", "število", "sklon",
                    "določnost"),
        "Adposition": ("sklon",),
        "Conjunction": ("vrsta",),
        "Residual": ("vrsta",),
    }

    def __init__(self):
        # http://nl.ijs.si/ME/V4/msd/html/msd.categories-sl.html
        # The English name of category "D" must be "Adposition": that is the
        # name every "predlog" row of pos_val uses, and msd_from_slo joins
        # the two tables on it.
        self.pos_slo_ang = [
            ("samostalnik", "S", "Noun", "N"),
            ("glagol", "G", "Verb", "V"),
            ("pridevnik", "P", "Adjective", "A"),
            ("prislov", "R", "Adverb", "R"),
            ("zaimek", "Z", "Pronoun", "P"),
            ("števnik", "K", "Numeral", "M"),
            ("predlog", "D", "Adposition", "S"),
            ("veznik", "V", "Conjunction", "C"),
            ("členek", "L", "Particle", "Q"),
            ("medmet", "M", "Interjection", "I"),
            ("okrajšava", "O", "Abbreviation", "Y"),
            ("neuvrščeno", "N", "Residual", "X"),
        ]

        # http://nl.ijs.si/ME/V4/msd/html/msd.values-sl.html
        # col:
        # (Value (sl), Code (sl), Attribute (sl), Category (sl),
        # Value (en), Code (en), Attribute (en), Category (en))
        self.pos_val = [
            ("arabski", "a", "zapis", "števnik",
             "digit", "d", "Form", "Numeral"),
            ("besedni", "b", "zapis", "števnik",
             "letter", "l", "Form", "Numeral"),
            ("celostni", "c", "vrsta", "zaimek",
             "general", "g", "Type", "Pronoun"),
            ("da", "d", "določnost", "pridevnik",
             "yes", "y", "Definiteness", "Adjective"),
            ("da", "d", "določnost", "števnik",
             "yes", "y", "Definiteness", "Numeral"),
            ("da", "d", "živost", "samostalnik",
             "yes", "y", "Animate", "Noun"),
            ("dajalnik", "d", "sklon", "predlog",
             "dative", "d", "Case", "Adposition"),
            ("dajalnik", "d", "sklon", "pridevnik",
             "dative", "d", "Case", "Adjective"),
            ("dajalnik", "d", "sklon", "samostalnik",
             "dative", "d", "Case", "Noun"),
            ("dajalnik", "d", "sklon", "zaimek",
             "dative", "d", "Case", "Pronoun"),
            ("dajalnik", "d", "sklon", "števnik",
             "dative", "d", "Case", "Numeral"),
            ("deležje", "d", "vrsta", "prislov",
             "participle", "r", "Type", "Adverb"),
            ("deležnik", "d", "oblika", "glagol",
             "participle", "p", "VForm", "Verb"),
            ("deležniški", "d", "vrsta", "pridevnik",
             "participle", "p", "Type", "Adjective"),
            ("dovršni", "d", "vid", "glagol",
             "perfective", "e", "Aspect", "Verb"),
            ("druga", "d", "oseba", "glagol",
             "second", "2", "Person", "Verb"),
            ("druga", "d", "oseba", "zaimek",
             "second", "2", "Person", "Pronoun"),
            ("drugi", "d", "vrsta", "števnik",
             "special", "s", "Type", "Numeral"),
            ("dvojina", "d", "število", "glagol",
             "dual", "d", "Number", "Verb"),
            ("dvojina", "d", "število", "pridevnik",
             "dual", "d", "Number", "Adjective"),
            ("dvojina", "d", "število", "samostalnik",
             "dual", "d", "Number", "Noun"),
            ("dvojina", "d", "število", "zaimek",
             "dual", "d", "Number", "Pronoun"),
            ("dvojina", "d", "število", "števnik",
             "dual", "d", "Number", "Numeral"),
            ("dvojina", "d", "število_svojine", "zaimek",
             "dual", "d", "Owner_Number", "Pronoun"),
            ("dvovidski", "v", "vid", "glagol",
             "biaspectual", "b", "Aspect", "Verb"),
            ("ednina", "e", "število", "glagol",
             "singular", "s", "Number", "Verb"),
            ("ednina", "e", "število", "pridevnik",
             "singular", "s", "Number", "Adjective"),
            ("ednina", "e", "število", "samostalnik",
             "singular", "s", "Number", "Noun"),
            ("ednina", "e", "število", "zaimek",
             "singular", "s", "Number", "Pronoun"),
            ("ednina", "e", "število", "števnik",
             "singular", "s", "Number", "Numeral"),
            ("ednina", "e", "število_svojine", "zaimek",
             "singular", "s", "Owner_Number", "Pronoun"),
            ("glavni", "g", "vrsta", "glagol",
             "main", "m", "Type", "Verb"),
            ("glavni", "g", "vrsta", "števnik",
             "cardinal", "c", "Type", "Numeral"),
            ("imenovalnik", "i", "sklon", "predlog",
             "nominative", "n", "Case", "Adposition"),
            ("imenovalnik", "i", "sklon", "pridevnik",
             "nominative", "n", "Case", "Adjective"),
            ("imenovalnik", "i", "sklon", "samostalnik",
             "nominative", "n", "Case", "Noun"),
            ("imenovalnik", "i", "sklon", "zaimek",
             "nominative", "n", "Case", "Pronoun"),
            ("imenovalnik", "i", "sklon", "števnik",
             "nominative", "n", "Case", "Numeral"),
            ("kazalni", "k", "vrsta", "zaimek",
             "demonstrative", "d", "Type", "Pronoun"),
            ("klitična", "k", "naslonskost", "zaimek",
             "yes", "y", "Clitic", "Pronoun"),
            ("lastno_ime", "l", "vrsta", "samostalnik",
             "proper", "p", "Type", "Noun"),
            ("mestnik", "m", "sklon", "predlog",
             "locative", "l", "Case", "Adposition"),
            ("mestnik", "m", "sklon", "pridevnik",
             "locative", "l", "Case", "Adjective"),
            ("mestnik", "m", "sklon", "samostalnik",
             "locative", "l", "Case", "Noun"),
            ("mestnik", "m", "sklon", "zaimek",
             "locative", "l", "Case", "Pronoun"),
            ("mestnik", "m", "sklon", "števnik",
             "locative", "l", "Case", "Numeral"),
            ("množina", "m", "število", "glagol",
             "plural", "p", "Number", "Verb"),
            ("množina", "m", "število", "pridevnik",
             "plural", "p", "Number", "Adjective"),
            ("množina", "m", "število", "samostalnik",
             "plural", "p", "Number", "Noun"),
            ("množina", "m", "število", "zaimek",
             "plural", "p", "Number", "Pronoun"),
            ("množina", "m", "število", "števnik",
             "plural", "p", "Number", "Numeral"),
            ("množina", "m", "število_svojine", "zaimek",
             "plural", "p", "Owner_Number", "Pronoun"),
            ("moški", "m", "spol", "glagol",
             "masculine", "m", "Gender", "Verb"),
            ("moški", "m", "spol", "pridevnik",
             "masculine", "m", "Gender", "Adjective"),
            ("moški", "m", "spol", "samostalnik",
             "masculine", "m", "Gender", "Noun"),
            ("moški", "m", "spol", "zaimek",
             "masculine", "m", "Gender", "Pronoun"),
            ("moški", "m", "spol", "števnik",
             "masculine", "m", "Gender", "Numeral"),
            ("moški", "m", "spol_svojine", "zaimek",
             "masculine", "m", "Owner_Gender", "Pronoun"),
            ("namenilnik", "m", "oblika", "glagol",
             "supine", "u", "VForm", "Verb"),
            ("navezna", "z", "naslonskost", "zaimek",
             "bound", "b", "Clitic", "Pronoun"),
            ("ne", "n", "določnost", "pridevnik",
             "no", "n", "Definiteness", "Adjective"),
            ("ne", "n", "določnost", "števnik",
             "no", "n", "Definiteness", "Numeral"),
            ("ne", "n", "živost", "samostalnik",
             "no", "n", "Animate", "Noun"),
            ("nedoločeno", "n", "stopnja", "pridevnik",
             "positive", "p", "Degree", "Adjective"),
            ("nedoločeno", "n", "stopnja", "prislov",
             "positive", "p", "Degree", "Adverb"),
            ("nedoločni", "n", "vrsta", "zaimek",
             "indefinite", "i", "Type", "Pronoun"),
            ("nedoločnik", "n", "oblika", "glagol",
             "infinitive", "n", "VForm", "Verb"),
            ("nedovršni", "n", "vid", "glagol",
             "progressive", "p", "Aspect", "Verb"),
            ("nezanikani", "n", "nikalnost", "glagol",
             "no", "n", "Negative", "Verb"),
            ("nikalni", "l", "vrsta", "zaimek",
             "negative", "z", "Type", "Pronoun"),
            ("občno_ime", "o", "vrsta", "samostalnik",
             "common", "c", "Type", "Noun"),
            ("orodnik", "o", "sklon", "predlog",
             "instrumental", "i", "Case", "Adposition"),
            ("orodnik", "o", "sklon", "pridevnik",
             "instrumental", "i", "Case", "Adjective"),
            ("orodnik", "o", "sklon", "samostalnik",
             "instrumental", "i", "Case", "Noun"),
            ("orodnik", "o", "sklon", "zaimek",
             "instrumental", "i", "Case", "Pronoun"),
            ("orodnik", "o", "sklon", "števnik",
             "instrumental", "i", "Case", "Numeral"),
            ("osebni", "o", "vrsta", "zaimek",
             "personal", "p", "Type", "Pronoun"),
            ("oziralni", "z", "vrsta", "zaimek",
             "relative", "r", "Type", "Pronoun"),
            ("podredni", "d", "vrsta", "veznik",
             "subordinating", "s", "Type", "Conjunction"),
            ("pogojnik", "g", "oblika", "glagol",
             "conditional", "c", "VForm", "Verb"),
            ("pomožni", "p", "vrsta", "glagol",
             "auxiliary", "a", "Type", "Verb"),
            ("povratni", "p", "vrsta", "zaimek",
             "reflexive", "x", "Type", "Pronoun"),
            ("presežnik", "s", "stopnja", "pridevnik",
             "superlative", "s", "Degree", "Adjective"),
            ("presežnik", "s", "stopnja", "prislov",
             "superlative", "s", "Degree", "Adverb"),
            ("prihodnjik", "p", "oblika", "glagol",
             "future", "f", "VForm", "Verb"),
            ("primernik", "p", "stopnja", "pridevnik",
             "comparative", "c", "Degree", "Adjective"),
            ("primernik", "r", "stopnja", "prislov",
             "comparative", "c", "Degree", "Adverb"),
            ("priredni", "p", "vrsta", "veznik",
             "coordinating", "c", "Type", "Conjunction"),
            ("program", "p", "vrsta", "neuvrščeno",
             "program", "p", "Type", "Residual"),
            ("prva", "p", "oseba", "glagol",
             "first", "1", "Person", "Verb"),
            ("prva", "p", "oseba", "zaimek",
             "first", "1", "Person", "Pronoun"),
            ("rimski", "r", "zapis", "števnik",
             "roman", "r", "Form", "Numeral"),
            ("rodilnik", "r", "sklon", "predlog",
             "genitive", "g", "Case", "Adposition"),
            ("rodilnik", "r", "sklon", "pridevnik",
             "genitive", "g", "Case", "Adjective"),
            ("rodilnik", "r", "sklon", "samostalnik",
             "genitive", "g", "Case", "Noun"),
            ("rodilnik", "r", "sklon", "zaimek",
             "genitive", "g", "Case", "Pronoun"),
            ("rodilnik", "r", "sklon", "števnik",
             "genitive", "g", "Case", "Numeral"),
            ("sedanjik", "s", "oblika", "glagol",
             "present", "r", "VForm", "Verb"),
            ("splošni", "p", "vrsta", "pridevnik",
             "general", "g", "Type", "Adjective"),
            ("splošni", "s", "vrsta", "prislov",
             "general", "g", "Type", "Adverb"),
            ("srednji", "s", "spol", "glagol",
             "neuter", "n", "Gender", "Verb"),
            ("srednji", "s", "spol", "pridevnik",
             "neuter", "n", "Gender", "Adjective"),
            ("srednji", "s", "spol", "samostalnik",
             "neuter", "n", "Gender", "Noun"),
            ("srednji", "s", "spol", "zaimek",
             "neuter", "n", "Gender", "Pronoun"),
            ("srednji", "s", "spol", "števnik",
             "neuter", "n", "Gender", "Numeral"),
            ("srednji", "s", "spol_svojine", "zaimek",
             "neuter", "n", "Owner_Gender", "Pronoun"),
            ("svojilni", "s", "vrsta", "pridevnik",
             "possessive", "s", "Type", "Adjective"),
            ("svojilni", "s", "vrsta", "zaimek",
             "possessive", "s", "Type", "Pronoun"),
            ("tipkarska", "t", "vrsta", "neuvrščeno",
             "typo", "t", "Type", "Residual"),
            ("tožilnik", "t", "sklon", "predlog",
             "accusative", "a", "Case", "Adposition"),
            ("tožilnik", "t", "sklon", "pridevnik",
             "accusative", "a", "Case", "Adjective"),
            ("tožilnik", "t", "sklon", "samostalnik",
             "accusative", "a", "Case", "Noun"),
            ("tožilnik", "t", "sklon", "zaimek",
             "accusative", "a", "Case", "Pronoun"),
            ("tožilnik", "t", "sklon", "števnik",
             "accusative", "a", "Case", "Numeral"),
            ("tretja", "t", "oseba", "glagol",
             "third", "3", "Person", "Verb"),
            ("tretja", "t", "oseba", "zaimek",
             "third", "3", "Person", "Pronoun"),
            ("tujejezično", "j", "vrsta", "neuvrščeno",
             "foreign", "f", "Type", "Residual"),
            ("velelnik", "v", "oblika", "glagol",
             "imperative", "m", "VForm", "Verb"),
            ("vprašalni", "v", "vrsta", "zaimek",
             "interrogative", "q", "Type", "Pronoun"),
            ("vrstilni", "v", "vrsta", "števnik",
             "ordinal", "o", "Type", "Numeral"),
            ("zaimkovni", "z", "vrsta", "števnik",
             "pronominal", "p", "Type", "Numeral"),
            ("zanikani", "d", "nikalnost", "glagol",
             "yes", "y", "Negative", "Verb"),
            ("ženski", "z", "spol", "glagol",
             "feminine", "f", "Gender", "Verb"),
            ("ženski", "z", "spol", "pridevnik",
             "feminine", "f", "Gender", "Adjective"),
            ("ženski", "z", "spol", "samostalnik",
             "feminine", "f", "Gender", "Noun"),
            ("ženski", "z", "spol", "zaimek",
             "feminine", "f", "Gender", "Pronoun"),
            ("ženski", "z", "spol", "števnik",
             "feminine", "f", "Gender", "Numeral"),
            ("ženski", "z", "spol_svojine", "zaimek",
             "feminine", "f", "Owner_Gender", "Pronoun"),
        ]

    def pos_slo_ang_map(self, col, query):
        """Return the first ``pos_slo_ang`` row whose column *col* is *query*.

        Args:
            col: column index into the 4-tuples (0 Slovene name, 1 Slovene
                code, 2 English name, 3 English code).
            query: value to look for in that column.

        Raises:
            ValueError: if no category row matches.
        """
        for pos in self.pos_slo_ang:
            if pos[col] == query:
                return pos
        raise ValueError("Wrong part of speech.")

    def pos_val_map(self, en_category, col, query, sl_attribute=None):
        """Return the first ``pos_val`` row of a category matching *query*.

        Args:
            en_category: English category name (column 7), e.g. "Noun".
            col: column index into the 8-tuples to compare against *query*.
            query: value to look for in column *col*.
            sl_attribute: optional Slovene attribute name (column 2).  When
                given, only rows of that attribute are considered, which is
                what makes a code lookup unambiguous.  When omitted, the
                first row of the category that matches is returned.

        Raises:
            ValueError: if no row matches.
        """
        for pos in self.pos_val:
            if pos[7] != en_category or pos[col] != query:
                continue
            if sl_attribute is None or pos[2] == sl_attribute:
                return pos
        raise ValueError("Wrong part of speech value.")

    def msd_from_slo(self, msd):
        """Decode a Slovene MSD string such as "Sommr".

        Args:
            msd: Slovene MSD; first character is the category code, the
                rest are positional attribute value codes ("-" = unset).

        Returns:
            ``(pos, attr)`` where *pos* is the matching ``pos_slo_ang`` row
            and *attr* is the list of ``pos_val`` rows for every attribute
            that is set, in MSD order.

        Raises:
            ValueError: if the category code is unknown (including an empty
                *msd*), if a value code is not valid for the attribute at
                its position, or if *msd* has more positions than the
                category has attributes.
        """
        # msd[:1] instead of msd[0]: an empty tag is reported as an unknown
        # part of speech rather than as a bare IndexError.
        pos = self.pos_slo_ang_map(1, msd[:1])
        category = pos[2]
        attr_names = self._ATTR_ORDER.get(category, ())
        codes = msd[1:]
        if len(codes) > len(attr_names):
            raise ValueError("Wrong part of speech value.")
        # Resolve every code against the attribute of its own position;
        # a lookup by code alone would pick whichever attribute of the
        # category happens to come first in the table.
        attr = [self.pos_val_map(category, 1, code, sl_attribute=name)
                for name, code in zip(attr_names, codes) if code != "-"]
        return pos, attr
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: decode a few sample Slovene MSD tags and print the
    # (category row, attribute rows) pair returned for each of them.
    mapper = Msdmap()
    for sample in ("Soser", "Ppnzmm", "Gp-d-mz"):
        print(mapper.msd_from_slo(sample))
|
|
@ -7,80 +7,182 @@ S_TAGS = ['S', 'pc']
|
|||
|
||||
# reads a TEI xml file and returns a dictionary:
|
||||
# { <sentence_id>: {
|
||||
# sid: <sentence_id>, # serves as index in MongoDB
|
||||
# text: ,
|
||||
# tokens: ,
|
||||
# sid: <sentence_id>, # serves as index in MongoDB
|
||||
# text: ,
|
||||
# tokens: ,
|
||||
# }}
|
||||
|
||||
|
||||
def parse_tei(filepath):
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
with open(filepath, "r") as fp:
|
||||
# remove namespaces
|
||||
xmlstr = fp.read()
|
||||
xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
|
||||
xmlstr = re.sub(' xml:', ' ', xmlstr)
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
with open(filepath, "r") as fp:
|
||||
# remove namespaces
|
||||
xmlstr = fp.read()
|
||||
xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
|
||||
xmlstr = re.sub(' xml:', ' ', xmlstr)
|
||||
|
||||
root = etree.XML(xmlstr.encode("utf-8"))
|
||||
root = etree.XML(xmlstr.encode("utf-8"))
|
||||
|
||||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
# Kres files start with <TEI id=...>
|
||||
guess_corpus = "KRES"
|
||||
divs = [root]
|
||||
else:
|
||||
guess_corpus = "SSJ"
|
||||
divs = root.findall(".//div")
|
||||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
# Kres files start with <TEI id=...>
|
||||
guess_corpus = "KRES"
|
||||
divs = [root]
|
||||
else:
|
||||
guess_corpus = "SSJ"
|
||||
divs = root.findall(".//div")
|
||||
|
||||
# parse divs
|
||||
for div in divs:
|
||||
f_id = div.get("id")
|
||||
# parse divs
|
||||
for div in divs:
|
||||
f_id = div.get("id")
|
||||
|
||||
# parse paragraphs
|
||||
for p in div.findall(".//p"):
|
||||
p_id = p.get("id").split(".")[-1]
|
||||
# parse paragraphs
|
||||
for p in div.findall(".//p"):
|
||||
p_id = p.get("id").split(".")[-1]
|
||||
|
||||
# parse sentences
|
||||
for s in p.findall(".//s"):
|
||||
s_id = s.get("id").split(".")[-1]
|
||||
sentence_text = ""
|
||||
sentence_tokens = []
|
||||
# parse sentences
|
||||
for s in p.findall(".//s"):
|
||||
s_id = s.get("id").split(".")[-1]
|
||||
sentence_text = ""
|
||||
sentence_tokens = []
|
||||
|
||||
# parse tokens
|
||||
for el in s.iter():
|
||||
if el.tag in W_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
el_id,
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
)]
|
||||
elif el.tag in C_TAGS:
|
||||
# only Kres' C_TAGS have ids
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in S_TAGS:
|
||||
# Kres' <S /> doesn't contain .text
|
||||
sentence_text += " "
|
||||
else:
|
||||
# pass links and linkGroups
|
||||
# print(el.tag)
|
||||
pass
|
||||
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
|
||||
"""
|
||||
print(sentence_id)
|
||||
print(sentence_text)
|
||||
print(sentence_tokens)
|
||||
"""
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens
|
||||
}
|
||||
return res_dict
|
||||
|
||||
|
||||
def msd_slo_to_ang(msd):
|
||||
# mapping table: http://nl.ijs.si/imp/msd/html-sl/#msd.index.values
|
||||
# 3.1.1: list of POS
|
||||
# 3.3.1: list of values
|
||||
|
||||
msd = "Sosei"
|
||||
|
||||
def slo_pos(msd):
|
||||
return msd[0]
|
||||
|
||||
def pos_slo_ang_map(col, query):
|
||||
pos_slo_ang = [
|
||||
("samostalnik", "S", "Noun", "N"),
|
||||
("glagol", "G", "Verb", "V"),
|
||||
("pridevnik", "P", "Adjective", "A"),
|
||||
("prislov", "R", "Adverb", "R"),
|
||||
("zaimek", "Z", "Pronoun", "P"),
|
||||
("števnik", "K", "Numeral", "M"),
|
||||
("predlog", "D", "Preposition", "S"),
|
||||
("veznik", "V", "Conjunction", "C"),
|
||||
("členek", "L", "Particle", "Q"),
|
||||
("medmet", "M", "Interjection", "I"),
|
||||
("okrajšava", "O", "Abbreviation", "Y"),
|
||||
("neuvrščeno", "N", "Residual", "X"),
|
||||
]
|
||||
for pos in pos_slo_ang:
|
||||
if pos[col] == query:
|
||||
return pos
|
||||
raise ValueError("Wrong part of speech.")
|
||||
|
||||
def pos_val_map(col, query):
|
||||
# col:
|
||||
# (sl_vrednost, sl_koda, sl_atribut, sl_kategorija,
|
||||
# en_vrednost, en_koda, en_atribut, en_kategorija)
|
||||
pos_val = [
|
||||
("arabski", "a", "zapis", "števnik",
|
||||
"digit", "d", "Form", "Numeral"),
|
||||
("besedni", "b", "zapis", "števnik",
|
||||
"letter", "l", "Form", "Numeral"),
|
||||
("deležje", "d", "vrsta", "prislov",
|
||||
"participle", "r", "Type", "Adverb"),
|
||||
("deležniški", "d", "vrsta", "pridevnik",
|
||||
" participle", "p", "Type", "Adjective"),
|
||||
("dovršni", "d", "vid", "glagol",
|
||||
" perfective", "e", "Aspect", "Verb"),
|
||||
("dvovidski", "v", "vid", "glagol",
|
||||
" biaspectual", "b", "Aspect", "Verb"),
|
||||
("glavni", "g", "vrsta", "glagol",
|
||||
" main", "m", "Type", "Verb"),
|
||||
("lastno_ime", "l", "vrsta", "samostalnik",
|
||||
"proper", "p", "Type", "Noun"),
|
||||
("moški", "m", "spol", "samostalnik",
|
||||
"masculine", "m", "Gender", "Noun"),
|
||||
("nedoločeno", "n", "stopnja", "pridevnik",
|
||||
" positive", "p", "Degree", "Adjective"),
|
||||
("nedoločeno", "n", "stopnja", "prislov",
|
||||
"positive", "p", "Degree", "Adverb"),
|
||||
("nedovršni", "n", "vid", "glagol",
|
||||
" progressive", "p", "Aspect", "Verb"),
|
||||
("občno_ime", "o", "vrsta", "samostalnik",
|
||||
"common", "c", "Type", "Noun"),
|
||||
("pomožni", "p", "vrsta", "glagol",
|
||||
" auxiliary", "a", "Type", "Verb"),
|
||||
("presežnik", "s", "stopnja", "pridevnik",
|
||||
" superlative", "s", "Degree", "Adjective"),
|
||||
("presežnik", "s", "stopnja", "prislov",
|
||||
"superlative", "s", "Degree", "Adverb"),
|
||||
("primernik", "p", "stopnja", "pridevnik",
|
||||
" comparative", "c", "Degree", "Adjective"),
|
||||
("primernik", "r", "stopnja", "prislov",
|
||||
"comparative", "c", "Degree", "Adverb"),
|
||||
("program", "p", "vrsta", "neuvrščeno",
|
||||
" program", "p", "Type", "Residual"),
|
||||
("rimski", "r", "zapis", "števnik",
|
||||
"roman", "r", "Form", "Numeral"),
|
||||
("splošni", "p", "vrsta", "pridevnik",
|
||||
" general", "g", "Type", "Adjective"),
|
||||
("splošni", "s", "vrsta", "prislov",
|
||||
"general", "g", "Type", "Adverb"),
|
||||
("srednji", "s", "spol", "samostalnik",
|
||||
"neuter", "n", "Gender", "Noun"),
|
||||
("svojilni", "s", "vrsta", "pridevnik",
|
||||
" possessive", "s", "Type", "Adjective"),
|
||||
("tipkarska", "t", "vrsta", "neuvrščeno",
|
||||
" typo", "t", "Type", "Residual"),
|
||||
("tujejezično", "j", "vrsta", "neuvrščeno",
|
||||
" foreign", "f", "Type", "Residual"),
|
||||
("ženski", "z", "spol", "samostalnik",
|
||||
"feminine", "f", "Gender", "Noun"),
|
||||
]
|
||||
for pos in pos_val:
|
||||
if pos[col] == query:
|
||||
return pos
|
||||
raise ValueError("Wrong part of speech value.")
|
||||
|
||||
# parse tokens
|
||||
for el in s.iter():
|
||||
if el.tag in W_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
el_id,
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES" else el.get("ana").split(":")[-1]),
|
||||
)]
|
||||
elif el.tag in C_TAGS:
|
||||
el_id = el.get("id") or "none" # only Kres' C_TAGS have ids
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in S_TAGS:
|
||||
sentence_text += " " # Kres' <S /> doesn't contain .text
|
||||
else:
|
||||
# pass links and linkGroups
|
||||
# print(el.tag)
|
||||
pass
|
||||
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
|
||||
"""
|
||||
print(sentence_id)
|
||||
print(sentence_text)
|
||||
print(sentence_tokens)
|
||||
"""
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens
|
||||
}
|
||||
return res_dict
|
||||
|
|
Loading…
Reference in New Issue
Block a user