Added s/z, k/h + fixed bug 90 + connecting with sloleks on lemma_fallback
This commit is contained in:
parent
ec113f9cd2
commit
777791ad1e
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,3 +9,4 @@ __pycache__
|
|||
|
||||
prev
|
||||
old
|
||||
data
|
||||
|
|
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
|
@ -1,3 +1,129 @@
|
|||
# Feature tables that recur for several word categories.  Each use below takes
# its own copy so that no two slots of CODES_TRANSLATION share a dict object.
_GENDER_NAMES = {
    'm': 'masculine',
    'f': 'feminine',
    'n': 'neuter',
}
_NUMBER_NAMES = {
    "s": "singular",
    "d": "dual",
    "p": "plural",
}
_CASE_NAMES = {
    "n": "nominative",
    "g": "genitive",
    "d": "dative",
    "a": "accusative",
    "l": "locative",
    "i": "instrumental",
}

# Lookup of English feature names, indexed as
#   CODES_TRANSLATION[category letter][position in the MSD string][code character]
# Only nouns ("N"), verbs ("V") and adjectives ("A") are covered.
CODES_TRANSLATION = {
    "N": {
        2: dict(_GENDER_NAMES),
        3: dict(_NUMBER_NAMES),
        4: dict(_CASE_NAMES),
    },
    "V": {
        1: {
            "m": "main",
            "a": "auxiliary",
        },
        3: {
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
            "r": "present",
            "f": "future",
            "c": "conditional",
            "m": "imperative",
        },
        4: {
            "1": "first",
            "2": "second",
            "3": "third",
        },
        5: dict(_NUMBER_NAMES),
        6: dict(_GENDER_NAMES),
        8: {
            "n": "no",
            "y": "yes",
        },
    },
    "A": {
        1: {
            "g": "general",
            "s": "possessive",
        },
        2: {
            "p": "positive",
            "c": "comparative",
            "s": "superlative",
        },
        3: dict(_GENDER_NAMES),
        4: dict(_NUMBER_NAMES),
        5: dict(_CASE_NAMES),
    },
    # Categories and codes that are not translated yet (kept as notes):
    # "R": "Adverb",
    # "P": "Pronoun",
    # "M": "Numeral",
    # "S": "Preposition",
    # "C": "Conjunction",
    # "Q": "Particle",
    # "I": "Interjection",
    # "Y": "Abbreviation",
    # "X": "Residual",
    #
    # "e": "perfective",
    # "p": "progressive",
    # "b": "biaspectual",
    #
    # "p": "personal",
    # "d": "demonstrative",
    # "r": "relative",
    # "x": "reflexive",
    # "q": "interrogative",
    # "i": "indefinite",
    # "z": "negative",
    # "b": "bound",
    # "d": "digit",
    # "r": "roman",
    # "l": "letter",
    # "c": "cardinal",
    # "o": "ordinal",
    # "p": "pronominal",
    # "s": "special",
    # "c": "coordinating",
    # "s": "subordinating",
    # "f": "foreign",
    # "t": "typo",
    # "p": "program",
    # "w": "web",
    # "e": "emo",
    # "h": "hashtag",
    # "a": "at"
}
|
||||
|
||||
CODES = {
|
||||
"Noun": "N",
|
||||
"Verb": "V",
|
||||
|
|
|
@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
|||
dest = l.get('dep')
|
||||
else:
|
||||
ana = l.get('ana')
|
||||
if ana[:4] != 'syn:': # dont bother...
|
||||
if ana[:8] != 'jos-syn:': # dont bother...
|
||||
continue
|
||||
ana = ana[4:]
|
||||
ana = ana[8:]
|
||||
lfrom, dest = l.get('target').replace('#', '').split()
|
||||
|
||||
if lfrom in words:
|
||||
|
|
|
@ -91,7 +91,14 @@ class MatchStore:
|
|||
(structure.id,)):
|
||||
yield StructureMatch.from_db(self.db, cid[0], structure)
|
||||
|
||||
def set_representations(self, word_renderer, structures):
|
||||
def add_inserts(self, inserts):
    """Store the rendered component texts of every match in `inserts`.

    For each match, one row is written to the Representations table per
    entry of `match.representations` (component id -> text), keyed by the
    match's `match_id`.
    """
    for structure_match in inserts:
        colocation_id = structure_match.match_id
        for comp_id, rendered_text in structure_match.representations.items():
            row = (colocation_id, comp_id, rendered_text)
            self.db.execute("""
                INSERT INTO Representations (colocation_id, component_id, text)
                VALUES (?,?,?)""", row)
|
||||
|
||||
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
||||
step_name = 'representation'
|
||||
if self.db.is_step_done(step_name):
|
||||
print("Representation step already done, skipping")
|
||||
|
@ -105,17 +112,14 @@ class MatchStore:
|
|||
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
||||
structure = structures_dict[sid]
|
||||
match = StructureMatch.from_db(self.db, cid, structure)
|
||||
RepresentationAssigner.set_representations(match, word_renderer)
|
||||
RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
|
||||
|
||||
inserts.append(match)
|
||||
if len(inserts) > num_inserts:
|
||||
for match in inserts:
|
||||
for component_id, text in match.representations.items():
|
||||
self.db.execute("""
|
||||
INSERT INTO Representations (colocation_id, component_id, text)
|
||||
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
||||
self.add_inserts(inserts)
|
||||
inserts = []
|
||||
|
||||
self.add_inserts(inserts)
|
||||
self.db.step_is_done(step_name)
|
||||
|
||||
def has_colocation_id_enough_frequency(self, colocation_id):
|
||||
|
|
38
src/postprocessor.py
Normal file
38
src/postprocessor.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
|
||||
class Postprocessor:
|
||||
def __init__(self, fix_one_letter_words=True):
|
||||
self.fix_one_letter_words = fix_one_letter_words
|
||||
|
||||
@staticmethod
|
||||
def fix_sz(next_word):
|
||||
if next_word[0] in ['c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't']:
|
||||
return 's'
|
||||
return 'z'
|
||||
|
||||
@staticmethod
|
||||
def fix_kh(next_word):
|
||||
if next_word[0] in ['g', 'k']:
|
||||
return 'h'
|
||||
return 'k'
|
||||
|
||||
def process(self, match, collocation_id):
|
||||
# self.matches = matches
|
||||
# if self.fix_one_letter_words:
|
||||
# for syn_structure_key, syn_structure_value in self.matches.items():
|
||||
# for match, collocation_id in syn_structure_value:
|
||||
if len(collocation_id) > 2:
|
||||
# a = collocation_id[1:-1]
|
||||
# b = enumerate(collocation_id[1:-1])
|
||||
# for a, c in b:
|
||||
# print('here')
|
||||
for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
|
||||
if word in ['s', 'z']:
|
||||
correct_letter = self.fix_sz(collocation_id[idx + 2][1])
|
||||
collocation_id[idx + 1][1] = correct_letter
|
||||
match[col_id].text = correct_letter
|
||||
elif word in ['k', 'h']:
|
||||
correct_letter = self.fix_kh(collocation_id[idx + 2][1])
|
||||
collocation_id[idx + 1][1] = correct_letter
|
||||
match[col_id].text = correct_letter
|
||||
collocation_id = [collocation_id[0]] + [tuple(line) for line in collocation_id[1:]]
|
||||
return match, collocation_id
|
|
@ -4,6 +4,9 @@ from collections import Counter
|
|||
from codes_tagset import TAGSET, CODES
|
||||
from word import WordMsdOnly
|
||||
|
||||
from src.word import WordDummy
|
||||
|
||||
|
||||
class ComponentRepresentation:
|
||||
def __init__(self, data, word_renderer):
|
||||
self.data = data
|
||||
|
@ -19,23 +22,23 @@ class ComponentRepresentation:
|
|||
def add_word(self, word):
|
||||
self.words.append(word)
|
||||
|
||||
def render(self):
|
||||
def render(self, sloleks_db=None):
|
||||
if self.rendition_text is None:
|
||||
self.rendition_text = self._render()
|
||||
self.rendition_text = self._render(sloleks_db=sloleks_db)
|
||||
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
||||
|
||||
class LemmaCR(ComponentRepresentation):
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
return self.words[0].lemma if len(self.words) > 0 else None
|
||||
|
||||
class LexisCR(ComponentRepresentation):
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
return self.data['lexis']
|
||||
|
||||
class WordFormAllCR(ComponentRepresentation):
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
if len(self.words) == 0:
|
||||
return None
|
||||
else:
|
||||
|
@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
|
|||
return "/".join(set(forms))
|
||||
|
||||
class WordFormAnyCR(ComponentRepresentation):
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
text_forms = {}
|
||||
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
||||
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
||||
|
@ -59,7 +62,24 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
for word_msd, word_lemma in sorted_words:
|
||||
# check if agreements match
|
||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||
|
||||
|
||||
# in case all agreements do not match try to get data from sloleks and change properly
|
||||
if not all(agreements_matched):
|
||||
if sloleks_db is None:
|
||||
raise Exception('sloleks_db not properly setup!')
|
||||
for agr in self.agreement:
|
||||
if not agr.match(word_msd):
|
||||
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
||||
if msd is not None:
|
||||
agr.msds[0] = msd
|
||||
agr.words.append(WordDummy(msd, lemma, text))
|
||||
# agr.words[0].msd = msd
|
||||
# agr.words[0].text = text
|
||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
# if we are at the last "backup word", then confirm matches
|
||||
# that worked for this one and return
|
||||
if word_lemma is None:
|
||||
|
@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
|
|||
if self.check_msd(word.msd):
|
||||
super().add_word(word)
|
||||
|
||||
def _render(self):
|
||||
def _render(self, sloleks_db=None):
|
||||
if len(self.words) == 0:
|
||||
if sloleks_db is None:
|
||||
raise Exception('sloleks_db not properly setup!')
|
||||
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
||||
if msd is not None:
|
||||
self.words.append(WordDummy(msd, lemma, text))
|
||||
self.words.append(WordMsdOnly(self._common_msd()))
|
||||
return super()._render()
|
||||
return super()._render(sloleks_db)
|
||||
|
||||
def _common_msd(self):
|
||||
msds = sorted(self.msds, key=len)
|
||||
|
@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):
|
|||
|
||||
return True
|
||||
|
||||
def render(self):
|
||||
def render(self, sloleks_db=None):
|
||||
pass
|
||||
|
|
|
@ -39,7 +39,7 @@ class RepresentationAssigner:
|
|||
return self.representation_factory(self.more, word_renderer)
|
||||
|
||||
@staticmethod
|
||||
def set_representations(match, word_renderer):
|
||||
def set_representations(match, word_renderer, sloleks_db=None):
|
||||
representations = {}
|
||||
for c in match.structure.components:
|
||||
representations[c.idx] = []
|
||||
|
@ -70,7 +70,7 @@ class RepresentationAssigner:
|
|||
|
||||
for cid, reps in representations.items():
|
||||
for rep in reps:
|
||||
rep.render()
|
||||
rep.render(sloleks_db=sloleks_db)
|
||||
|
||||
for cid, reps in representations.items():
|
||||
reps = [rep.rendition_text for rep in reps]
|
||||
|
|
191
src/sloleks_db.py
Normal file
191
src/sloleks_db.py
Normal file
|
@ -0,0 +1,191 @@
|
|||
from collections import defaultdict
|
||||
from ast import literal_eval
|
||||
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session, aliased
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy import func
|
||||
|
||||
from match import StructureMatch
|
||||
from representation_assigner import RepresentationAssigner
|
||||
from progress_bar import progress
|
||||
|
||||
# Lexeme = None
|
||||
# LexemeFeature = None
|
||||
# SyntacticStructure = None
|
||||
# StructureComponent = None
|
||||
# Feature = None
|
||||
# LexicalUnitLexeme = None
|
||||
# LexicalUnit = None
|
||||
# LexicalUnitType = None
|
||||
# Category = None
|
||||
# Sense = None
|
||||
# Measure = None
|
||||
# LexicalUnitMeasure = None
|
||||
# Corpus = None
|
||||
# Definition = None
|
||||
# WordForm = None
|
||||
# WordFormFeature = None
|
||||
# FormRepresentation = None
|
||||
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
||||
|
||||
|
||||
class SloleksDatabase:
    """Looks up inflected word forms in a Sloleks PostgreSQL database."""

    def __init__(self, db):
        """Connect to the database and reflect its tables into ORM classes.

        :param db: credentials string of the form
            ``user:password:database:host``.

        The reflected model classes are published as module-level globals
        because `get_word_form` refers to them by bare name.
        """
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        Base.metadata.reflect(engine)

        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        self.session = Session(engine)

    def close(self):
        """Close the database session."""
        self.session.close()

    def decypher_msd(self, msd):
        """Translate an MSD into the list of feature values used for lookup.

        :param msd: MSD as a string or a list of single characters.
        :return: ``[number, case]`` for nouns, ``[vform, number, 'third']``
            for verbs, ``[gender, number, case]`` for adjectives.  An empty
            list is returned for any other category and for an MSD that is
            empty, too short, or carries a code that has no translation.
        """
        if not msd:
            return []

        t = msd[0]
        try:
            if t == 'N':
                number = CODES_TRANSLATION[t][3][msd[3]]
                case = CODES_TRANSLATION[t][4][msd[4]]
                return [number, case]
            if t == 'V':
                vform = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][5][msd[5]]
                # NOTE(review): person is always looked up as 'third',
                # regardless of the person code in the MSD.
                person = 'third'
                return [vform, number, person]
            if t == 'A':
                gender = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][4][msd[4]]
                case = CODES_TRANSLATION[t][5][msd[5]]
                return [gender, number, case]
        except (KeyError, IndexError):
            # Short MSD or an untranslatable code: report "nothing to look up"
            # so the caller falls through to its (None, None, None) result.
            return []

        return []

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Find a form of `lemma` whose features match the adjusted `msd`.

        :param lemma: lemma to look up.
        :param msd: starting MSD (string or sequence of characters).
        :param data: component definition; its ``'msd'`` entry (attribute
            name -> value name) is applied when `align_msd` is falsy,
            otherwise the attributes listed under ``'agreement'`` are copied
            from `align_msd`.
        :param align_msd: MSD of the word this form must agree with, or a
            falsy value when no alignment is requested.
        :return: ``(msd, lemma, form)`` for the first matching form, or
            ``(None, None, None)`` when the MSD cannot be adjusted or
            translated, or when no form is found.
        """
        # Work on a mutable copy so that single positions can be overwritten.
        msd = list(msd)

        if not align_msd and 'msd' in data:
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
                # Attribute number v lives at position v + 1, because
                # position 0 holds the category letter.
                if v + 1 >= len(msd):
                    return None, None, None
                msd[v + 1] = CODES[value]

        elif align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]

            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # Either MSD may be too short to carry this attribute (seen
                # with verbs); both positions are checked before the copy.
                if v + 1 >= len(msd) or v_align_msd + 1 >= len(align_msd):
                    return None, None, None
                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)

        if not decypher_msd:
            return None, None, None

        # One alias of the feature table per required feature value, so that
        # every value can be demanded of the same word form.
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
        query_preposition = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

        for wf in wfs:
            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)

        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)

        for wf, msd_el in zip(wfs, decypher_msd):
            query_preposition = query_preposition.filter(wf.value == msd_el)

        # Only the first hit is used, so do not fetch the whole result set.
        first_hit = query_preposition.first()
        if first_hit is not None:
            return ''.join(msd), lemma, first_hit[0]
        return None, None, None
|
|
@ -14,7 +14,7 @@ class SyntacticStructure:
|
|||
@staticmethod
|
||||
def from_xml(xml):
|
||||
st = SyntacticStructure()
|
||||
st.id = xml.get('id')
|
||||
st.id = xml.get('id_nsss')
|
||||
st.lbs = xml.get('LBS')
|
||||
|
||||
assert len(list(xml)) == 1
|
||||
|
|
25
src/wani.py
25
src/wani.py
|
@ -11,6 +11,7 @@ import concurrent.futures
|
|||
import tempfile
|
||||
|
||||
from progress_bar import progress
|
||||
from sloleks_db import SloleksDatabase
|
||||
from word import Word
|
||||
from syntactic_structure import build_structures
|
||||
from match_store import MatchStore
|
||||
|
@ -20,16 +21,20 @@ from loader import load_files
|
|||
from database import Database
|
||||
from time_info import TimeInfo
|
||||
|
||||
from src.postprocessor import Postprocessor
|
||||
|
||||
def match_file(words, structures):
|
||||
|
||||
def match_file(words, structures, postprocessor):
|
||||
matches = {s: [] for s in structures}
|
||||
|
||||
for s in progress(structures, "matching"):
|
||||
for w in words:
|
||||
mhere = s.match(w)
|
||||
for match in mhere:
|
||||
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
|
||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
||||
match, collocation_id = postprocessor.process(match, colocation_id)
|
||||
colocation_id = tuple(colocation_id)
|
||||
|
||||
matches[s].append((match, colocation_id))
|
||||
|
@ -38,6 +43,7 @@ def match_file(words, structures):
|
|||
|
||||
|
||||
def main(args):
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db)
|
||||
structures, lemma_msds, max_num_components = build_structures(args)
|
||||
timeinfo = TimeInfo(len(args.input))
|
||||
|
||||
|
@ -51,7 +57,11 @@ def main(args):
|
|||
continue
|
||||
|
||||
start_time = time.time()
|
||||
matches = match_file(words, structures)
|
||||
postprocessor = Postprocessor()
|
||||
matches = match_file(words, structures, postprocessor)
|
||||
|
||||
# matches = .process()
|
||||
# TODO Add postprocessing here or inside previous function!
|
||||
match_store.add_matches(matches)
|
||||
word_stats.add_words(words)
|
||||
database.commit()
|
||||
|
@ -74,7 +84,7 @@ def main(args):
|
|||
|
||||
# figure out representations!
|
||||
if args.out or args.out_no_stat:
|
||||
match_store.set_representations(word_stats, structures)
|
||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||
|
||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
@ -85,6 +95,10 @@ def main(args):
|
|||
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
||||
# sloleks_db.get_word_form(lemma, gender, number, case)
|
||||
sloleks_db.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract structures from a parsed corpus.')
|
||||
|
@ -92,6 +106,7 @@ if __name__ == '__main__':
|
|||
help='Structures definitions in xml file')
|
||||
parser.add_argument('input',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
|
||||
parser.add_argument('--out',
|
||||
help='Classic output file')
|
||||
parser.add_argument('--out-no-stat',
|
||||
|
@ -100,7 +115,7 @@ if __name__ == '__main__':
|
|||
help='Additional output file, writes more data')
|
||||
parser.add_argument('--stats',
|
||||
help='Output file for statistics')
|
||||
|
||||
#
|
||||
parser.add_argument('--no-msd-translate',
|
||||
help='MSDs are translated from slovene to english by default',
|
||||
action='store_true')
|
||||
|
|
18
src/word.py
18
src/word.py
|
@ -14,6 +14,16 @@ class WordMsdOnly:
|
|||
return None
|
||||
|
||||
|
||||
class WordDummy:
    """Minimal word stand-in that carries only an MSD, a lemma and a text."""

    def __init__(self, msd, lemma, text):
        self.msd, self.lemma, self.text = msd, lemma, text

    def most_frequent_text(self, word_renderer):
        """Return what `word_renderer.render` produces for this lemma and MSD.

        NOTE(review): the stored `text` is not consulted here - confirm that
        this is intended.
        """
        rendered = word_renderer.render(self.lemma, self.msd)
        return rendered
|
||||
|
||||
|
||||
class Word:
|
||||
def __init__(self, lemma, msd, wid, text, do_msd_translate):
|
||||
self.lemma = lemma
|
||||
|
@ -29,7 +39,7 @@ class Word:
|
|||
self.int_id = int(last_num)
|
||||
|
||||
assert None not in (self.id, self.lemma, self.msd)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def from_xml(xml, do_msd_translate):
|
||||
lemma = xml.get('lemma')
|
||||
|
@ -41,10 +51,10 @@ class Word:
|
|||
@staticmethod
|
||||
def get_msd(comp):
|
||||
d = dict(comp.items())
|
||||
if 'msd' in d:
|
||||
return d['msd']
|
||||
elif 'ana' in d:
|
||||
if 'ana' in d:
|
||||
return d['ana'][4:]
|
||||
elif 'msd' in d:
|
||||
return d['msd']
|
||||
else:
|
||||
logging.error(d)
|
||||
raise NotImplementedError("MSD?")
|
||||
|
|
Loading…
Reference in New Issue
Block a user