forked from kristjan/cjvt-valency
A couple of fixes to write_xml in create_xml.py; created the form_csv.py script
This commit is contained in:
parent c18aaff11f
commit 75b015dcda
create_xml.py
@@ -3,6 +3,7 @@
 #imports from luscenje_struktur
 import copy
+import csv
 
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
@@ -114,6 +115,8 @@ CASE_MAP = {
     'i': 'instrumental'
 }
 
+ssj_frequency_dict = {}
+
 
 Lexeme = None
 LexemeFeature = None
@@ -130,6 +133,7 @@ Definition = None
 WordForm = None
 WordFormFeature = None
 FormRepresentation = None
+FormEncoding = None
 
 
 # corpus = 'gigafida'
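The new module-level ssj_frequency_dict is filled by read_ssj500k_frequencies (added further down) and consulted in write_xml, keyed by (headword, aspect) pairs. A minimal sketch of the guarded lookup, with hypothetical entries:

# Hypothetical entry; real ones are loaded from the TSV file.
ssj_frequency_dict = {('delati', 'progressive'): '1500'}

key = ('delati', 'progressive')  # (headword_text, features[0].value)
ssj_frequency = ssj_frequency_dict[key] if key in ssj_frequency_dict else None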
@@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
 
 
 def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
-                                  Lexeme.dummy, LexicalUnitType.name) \
+                                  Lexeme.potential_lexeme, LexicalUnitType.name) \
         .join(Category, Category.id == Lexeme.category_id) \
         .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
         .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
@@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     wf1 = aliased(WordFormFeature)
     wf2 = aliased(WordFormFeature)
     wf3 = aliased(WordFormFeature)
-    query_preposition = session.query(FormRepresentation.form) \
+    query_preposition = session.query(FormEncoding.text) \
+        .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
         .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
         .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
         .join(wf1, wf1.word_form_id == WordForm.id) \
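The preposition query now selects the surface string from FormEncoding.text and joins back through FormRepresentation; previously the string lived on FormRepresentation.form. A standalone sketch of the new join chain, assuming the reflected ORM classes set up in init_db:

# Sketch only; session and the reflected classes come from init_db below.
forms = session.query(FormEncoding.text) \
    .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
    .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
    .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
    .filter(Lexeme.lemma == 'na').all()  # 'na' is a hypothetical preposition lemma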
@@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 
     pattern_translation_3_sin = headword_text
     if len(pattern_translation_hws) == 1:
-        pattern_translation_3_sin = pattern_translation_hws[0].form
+        pattern_translation_3_sin = pattern_translation_hws[0].text
 
     qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
     dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
@@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             headword_text_query = headword_text[:-1]
         else:
             headword_text_query = headword_text
+
         query = query_general.filter(Category.name == category_text) \
             .filter(Lexeme.lemma == headword_text_query) \
             .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
@@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         if len(query_res) == 1:
             (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                 query_res[0]
+            sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+                Sense.lexical_unit_id == lexical_unit_id).all()
+            features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                .filter(Feature.name == 'aspect').all()
 
         elif len(query_res) > 1:
+            # find dummy
+            dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
+                                        Lexeme.potential_lexeme, LexicalUnitType.name) \
+                .join(Category, Category.id == Lexeme.category_id) \
+                .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
+                .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
+                .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
+                .filter(LexicalUnitType.name == 'single_lexeme_unit') \
+                .filter(Corpus.name == 'gigafida') \
+                .filter(Corpus.version == '2.0') \
+                .filter(Lexeme.lemma == headword_text_query).all()
 
             # all lexical_unit_ids equal or at least one dummy
+            dummy_exists = False
             final_lexical_unit_id = 0
             final_lexical_unit_lexeme_id = 0
-            for r in query_res:
-                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
+            for r in dummy_query:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
                  lexical_unit_type_name) = r
                 if dummy:
                     final_lexical_unit_id = lexical_unit_id
                     final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
+                    dummy_exists = True
                     break
+            assert dummy_exists
 
+            sense_ids = []
+            features_set = set()
+            frequency = 0
+            for r in query_res:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
+                 lexical_unit_type_name) = r
+                if dummy:
+                    continue
+                sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
+                    Sense.lexical_unit_id == lexical_unit_id).all())
+                features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                    .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                    .filter(Feature.name == 'aspect').all()
+
+                # set features in dictionary
+                if not features:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            features.add(f)
+                # compare features
+                else:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            if f not in features:
+                                raise Exception('Different features in query_res - might be problematic!')
+
+                frequency += el_frequency
+
+            # check if any actual sense exists if not erase all but one
+            any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
+            if not any_sense_not_dummy:
+                sense_ids = sense_ids[-1:]
+
             lexical_unit_id = final_lexical_unit_id
             lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
+            # sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+            #     Sense.lexical_unit_id == lexical_unit_id).all()
+            # features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+            #     .filter(LexemeFeature.lexeme_id == lexeme_id) \
+            #     .filter(Feature.name == 'aspect').all()
         else:
-            frequency = 0
+            frequency = None
             lexeme_id = 0
             lexical_unit_id = 0
             lexical_unit_lexeme_id = 0
             lexical_unit_type_name = ''
+            sense_ids = []
+            features = []
 
-        sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
-        features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
-            .filter(LexemeFeature.lexeme_id == lexeme_id) \
-            .filter(Feature.name == 'aspect').all()
 
         entry = lxml.SubElement(dictionary, 'entry')
 
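When several lexemes match one lemma, the rewritten branch anchors the entry on the dummy (potential) lexical unit found by dummy_query, then pools senses and sums frequencies over the non-dummy rows of query_res. A worked sketch of that aggregation with hypothetical tuples:

# Hypothetical rows: (lexeme_id, lu_lexeme_id, lu_id, frequency, dummy, type_name)
query_res = [(1, 10, 100, 40.0, False, 'single_lexeme_unit'),
             (2, 20, 200, 60.0, False, 'single_lexeme_unit')]
frequency = 0
for (_, _, _, el_frequency, dummy, _) in query_res:
    if dummy:
        continue
    frequency += el_frequency
assert frequency == 100.0  # summed over non-dummy lexemes only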
@@ -875,23 +941,33 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
         else:
             category.text = category_text
+        ssj_frequency = None
+        if len(features) > 0:
             grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
+            ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
             if args.language == 'sl':
                 grammarFeature.set('name', 'vid')
-                grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
-                    0].value in ASPECT_MAP else ''
+                if len(features) > 1:
+                    print(features)
+                grammarFeature.text = ASPECT_MAP[features[0].value]
             else:
                 grammarFeature.set('name', 'aspect')
-                grammarFeature.text = features[0].value if len(features) > 0 else ''
+                grammarFeature.text = features[0].value
 
         measureList = lxml.SubElement(head, 'measureList')
+        if frequency:
             measure = lxml.SubElement(measureList, 'measure')
             measure.set('type', 'frequency')
-            # TODO Modify this!
             measure.set('source', 'Gigafida 2.0')
             # measure.set('source', 'ssj500k')
             measure.text = str(int(frequency))
 
+        if ssj_frequency is not None:
+            measure = lxml.SubElement(measureList, 'measure')
+            measure.set('type', 'frequency')
+            measure.set('source', 'ssj500k 2.2')
+            measure.text = str(int(ssj_frequency))
+
 
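The measureList can now carry two frequency measures, one per corpus; the ssj500k one is emitted only when a (headword, aspect) entry was loaded into ssj_frequency_dict. A minimal sketch of the resulting structure (values hypothetical):

import lxml.etree as lxml  # assumed import; create_xml.py uses lxml.Element/SubElement under this name

head = lxml.Element('head')
measureList = lxml.SubElement(head, 'measureList')
for source, value in [('Gigafida 2.0', 1234), ('ssj500k 2.2', 56)]:
    measure = lxml.SubElement(measureList, 'measure')
    measure.set('type', 'frequency')
    measure.set('source', source)
    measure.text = str(int(value))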
@@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                     semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
 
         senseList = lxml.SubElement(body, 'senseList')
 
+        # handle cases when headword is not in sloleks
+        if len(sense_ids) == 0:
+            sense_ids = [-1]
+
         for sense_id in sense_ids:
-            if len(sense_ids) > 1 and sense_id.dummy:
+            if len(sense_ids) > 1 and sense_id.potential_sense:
                 continue
 
             sense = lxml.SubElement(senseList, 'sense')
-            if not sense_id.dummy:
+            if not sense_id == -1 and not sense_id.potential_sense:
                 sense.set('id', str(sense_id.id))
 
             definitionList = lxml.SubElement(sense, 'definitionList')
 
+            if not sense_id == -1:
                 definition_texts = session.query(Definition.description).filter(
                     Definition.sense_id == sense_id.id).all()
+            else:
+                definition_texts = []
 
             for definition_text in definition_texts:
                 definition = lxml.SubElement(definitionList, 'definition')
@@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for syn_struct_id, component_dict in syntactic_structure_dict.items():
                 syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
                 syntacticStructure.set('id', syn_struct_id)
+                dedup_dict = {}
                 for comp_id, lexemes in component_dict.items():
                     for l in lexemes:
-                        component = lxml.SubElement(syntacticStructure, 'component')
-                        component.set('num', comp_id)
-                        lexem = lxml.SubElement(component, 'lexeme')
 
                         if l in preposition_list:
                             prep_id = preposition_list[l]
@@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                                 prep_id = str(preposition_ids[0][0])
                                 preposition_list[l] = prep_id
 
+                        if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
+                            continue
+
+                        dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
+                        component = lxml.SubElement(syntacticStructure, 'component')
+                        component.set('num', comp_id)
+                        lexem = lxml.SubElement(component, 'lexeme')
 
                         lexem.set('sloleks', prep_id)
                         lexem.text = l[2]
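The dedup_dict added above suppresses repeated component entries within one syntacticStructure: a (comp_id, prep_id) slot that already recorded the same surface form l[2] is skipped. Note that the stored value is a single string, so the in test is a substring check against the last stored form. A minimal sketch of the check:

dedup_dict = {}
comp_id, prep_id, text = '2', '137', 'na'  # hypothetical values
seen = comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and text in dedup_dict[comp_id][prep_id]
if not seen:
    dedup_dict.setdefault(comp_id, {})[prep_id] = text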
@@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 
 
 def init_db(db):
-    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1152,6 +1241,9 @@ def init_db(db):
     class FormRepresentation(Base):
         __table__ = Base.metadata.tables['jedro_formrepresentation']
 
+    class FormEncoding(Base):
+        __table__ = Base.metadata.tables['jedro_formencoding']
+
     return engine
 
 
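FormEncoding is mapped the same way as the other reflected classes: init_db reflects the existing schema into Base.metadata and each class binds to one table. A self-contained sketch of the pattern, with a hypothetical connection string:

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('postgresql://user:password@localhost/sloleks')  # hypothetical DSN
Base = declarative_base()
Base.metadata.reflect(engine)  # load existing table definitions

class FormEncoding(Base):
    __table__ = Base.metadata.tables['jedro_formencoding']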
@@ -1415,6 +1507,14 @@ def get_headword_category(collection):
     return headword_category
 
 
+def read_ssj500k_frequencies(path):
+    with open(path, 'r') as f:
+        reader = csv.reader(f, delimiter='\t')
+        next(reader)
+        for line in reader:
+            ssj_frequency_dict[(line[1], line[-1])] = line[2]
+
+
 def main(args):
     # with Path('data/wordlist.json').open("r") as fp:
     #     sskj_wordlist = json.load(fp)
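read_ssj500k_frequencies expects a tab-separated file with a header row; it keys the dict on (column 1, last column) and stores column 2 as the value. The exact column layout is inferred from the indices used, so treat this sample as hypothetical:

# Hypothetical row after the header: id, lemma, frequency, ..., aspect
line = ['42', 'delati', '1500', 'progressive']
ssj_frequency_dict = {}
ssj_frequency_dict[(line[1], line[-1])] = line[2]
# Values stay strings here; write_xml later converts with str(int(...)).
assert ssj_frequency_dict[('delati', 'progressive')] == '1500'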
@@ -1518,7 +1618,9 @@ def main(args):
 
     print('write_xml')
     start_time = time.time()
-    # print('aa ' + 3)
+    if args.ssj500k_frequencies is not None:
+        read_ssj500k_frequencies(args.ssj500k_frequencies)
+
     with tqdm(total=len(headword_category)) as pbar:
         write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
     print(time.time() - start_time)
@@ -1592,6 +1694,9 @@ if __name__ == '__main__':
     arg_parser.add_argument('--pc-tag',
                             help='Tag for separators, usually pc or c', default="pc")
 
+    arg_parser.add_argument('--ssj500k-frequencies',
+                            help='Path to a TSV file with ssj500k frequencies', default=None)
+
     args = arg_parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
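With the new flag, ssj500k counts are merged in only when a path is supplied, e.g. python create_xml.py ... --ssj500k-frequencies freqs.tsv (hypothetical path; the other required arguments are omitted here). When the flag is absent, ssj_frequency_dict stays empty and no ssj500k measure is written.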
|
scripts/form_csv.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+import argparse
+import csv
+import os
+
+from lxml import etree
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+        # for element in tree.iterfind('statisticsContainer'):
+        # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+            measure = None
+            for measure_el in elem.find('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+
+    args = arg_parser.parse_args()
+
+    main(args)
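form_csv.py walks the --input directory of generated lexicon XML files and writes, per headword, two TSVs of semantic-role ratios (one for Gigafida 2.0, one for ssj500k 2.2). The valencyPattern loop collects a Gigafida measure but does not yet write it anywhere. The core extraction per file, with a hypothetical input path:

# Hypothetical path; mirrors what main() does for each file.
from lxml import etree

tree = etree.parse('out_xml/delati.xml')
for elem in tree.iter('statisticsContainer'):
    semRole = elem.find('semanticRole').text
    for el in elem.find('measureList'):
        if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
            print(semRole, el.text)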
@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd