#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Implements the probability models proposed in the article to make a choice
in slots where frame matching left several possible roles.
There are four possible models:
* default does not use any collected data, nor the list of possible roles,
and makes a default assignment depending on the slot class
* slot_class chooses the most likely of the possible roles given the slot
class of the slot (the difference with default is that the chosen role is
guaranteed to be in the list of possible roles for this slot)
* slot chooses the most likely of the possible roles given the slot type
(that is, the slot class, except that the PP class is divided into one class
for each preposition)
* predicate_slot chooses the most likely of the possible roles given the
slot type and the predicate
A fifth model, vnclass_slot, is also implemented: it combines the role
distributions observed for the slot type in each VerbNet class of the
predicate, weighting each class by its relative count for that predicate.
"""
import unittest
import math
from verbnetframe import ComputeSlotTypeMixin
from collections import defaultdict
from functools import reduce
# Placeholder used as the preposition key for slots that are not
# prepositional objects, so that every slot can be indexed the same way.
NO_PREP = "no_prep_magic_value"
# Names of the models accepted by ProbabilityModel.best_role.
models = ["default", "slot_class", "slot", "predicate_slot", "vnclass_slot"]
def multi_get(d, l, default=None):
    """Traverse several levels of nested dictionaries.

    :param d: The (possibly nested) dictionary to look into
    :type d: dict
    :param l: The successive keys to follow, outermost first
    :type l: iterable
    :param default: The value returned when the lookup fails
    :returns: The value found at the end of the key path, or ``default``
        when ``d`` is empty, when a level is missing along the way, or when
        the value found is falsy (``None``, ``0``, an empty dict...)
    """
    node = d
    for key in l:
        # Stop as soon as a level is missing: ``default`` is the answer and
        # must never itself be traversed with the remaining keys.
        if not node:
            return default
        node = node.get(key)
    return node if node else default
def multi_default_dict(dimension):
    """Build an empty nested defaultdict whose leaves are int counters.

    :param dimension: The number of key levels of the resulting dictionary
    :type dimension: int
    :returns: A defaultdict; a dimension of 1 or less yields a flat
        ``defaultdict(int)``
    """
    if dimension > 1:
        def make_inner():
            # Each missing key spawns a dictionary with one level less.
            return multi_default_dict(dimension - 1)
        return defaultdict(make_inner)
    return defaultdict(int)
def multi_count(obj):
    """Add up every number stored in a nested dictionary.

    :param obj: A number, or a dictionary whose values are numbers or
        further dictionaries
    :returns: ``obj`` itself when it is a number, otherwise the sum of all
        the numbers found under it
    """
    if isinstance(obj, (int, float)):
        return obj
    total = 0
    for child in obj.values():
        total += multi_count(child)
    return total
def check_depth(data, depth):
    """Tell whether every leaf of a nested dictionary sits at a given depth.

    :param data: A number, or a dictionary of numbers / dictionaries
    :param depth: The expected number of dictionary levels above each number
    :type depth: int
    :returns: bool -- True when all the numbers are exactly ``depth`` levels
        deep
    """
    scalar = isinstance(data, (int, float))
    if depth == 0:
        return scalar
    if scalar:
        return False
    # Every child is visited, even after a failure has been found.
    consistent = True
    for child in data.values():
        if not check_depth(child, depth - 1):
            consistent = False
    return consistent
def root_vnclass(vnclass):
    """Strip everything from the first dash of a VerbNet class name.

    :param vnclass: The class name
    :type vnclass: str
    :returns: str -- The part of ``vnclass`` that precedes its first "-",
        or ``vnclass`` unchanged when it contains no "-"
    """
    prefix, _, _ = vnclass.partition("-")
    return prefix
class ProbabilityModel:
    """Class used to collect data and apply one probability model

    :var data_default: str Dict -- The default role of each slot class
    :var data_slot_class: 2D Dict -- The number of occurrences of each role
        in every slot class
    :var data_slot: 3D Dict -- The number of occurrences of each role in
        every (slot class, preposition)
    :var data_predicate_slot: 4D Dict -- The number of occurrences of each
        role in every (predicate, slot class, preposition)
    :var data_vnclass_slot: 4D Dict -- The number of occurrences of each role
        in every (VerbNet class, slot class, preposition)
    :var data_vnclass: 2D Dict -- For each verb, a count for each of its root
        VerbNet classes
    :var data_bootstrap_p: 5D Dict -- Counts of the most specific bootstrap
        model
    :var data_bootstrap_p1, data_bootstrap_p2, data_bootstrap_p3: Counts of
        the first backoff level of the bootstrap algorithm; the matching
        ``*_sum`` attributes hold the same counts summed over the roles
    """

    guess_good = 1
    guess_unknown = 0
    guess_bad = -1

    def __init__(self, vn_classes=None, vn_init_value=None):
        """Create an empty model.

        :param vn_classes: For each verb, the names of its VerbNet classes
        :type vn_classes: None | dict
        :param vn_init_value: The initial count given to every (verb, root
            VerbNet class) pair found in ``vn_classes``
        :type vn_init_value: None | number
        """
        slot_types = ComputeSlotTypeMixin.slot_types
        self.data_default = {
            slot_types["subject"]: "Agent",
            slot_types["object"]: "Theme",
            slot_types["indirect_object"]: "Recipient",
            slot_types["prep_object"]: "Location",
        }
        self.data_slot_class = multi_default_dict(2)
        self.data_slot = multi_default_dict(3)
        self.data_predicate_slot = multi_default_dict(4)
        self.data_bootstrap_p = multi_default_dict(5)
        self.data_bootstrap_p1 = multi_default_dict(3)
        self.data_bootstrap_p2 = multi_default_dict(3)
        self.data_bootstrap_p3 = multi_default_dict(4)
        self.data_bootstrap_p1_sum = multi_default_dict(2)
        self.data_bootstrap_p2_sum = multi_default_dict(2)
        self.data_bootstrap_p3_sum = multi_default_dict(3)
        self.data_vnclass_slot = multi_default_dict(4)

        # Always defined, so that the VerbNet-class based methods also work
        # on a model that was built without any VerbNet information.
        self.data_vnclass = defaultdict(dict)
        if vn_classes is not None and vn_init_value is not None:
            for verb, verb_vnclass in vn_classes.items():
                for vnclass in verb_vnclass:
                    self.data_vnclass[verb][root_vnclass(vnclass)] = vn_init_value

    def add_data(self, slot_class, role, prep, predicate, vnclass=None):
        """Use one known occurrence of a role in a given context to update
        the data of every model

        :param slot_class: The slot class of the slot where the role occurred
        :type slot_class: str
        :param role: The role that occurred
        :type role: str
        :param prep: The preposition which introduced the slot if it was a PP
            slot (ignored for the other slot classes)
        :type prep: str
        :param predicate: The predicate of which the slot was an argument
        :type predicate: str
        :param vnclass: The VerbNet class of the predicate
        :type vnclass: None | str
        """
        # Only PP slots are split by preposition; the others share one key.
        if slot_class != ComputeSlotTypeMixin.slot_types["prep_object"]:
            prep = NO_PREP

        self.data_slot_class[slot_class][role] += 1
        self.data_slot[slot_class][prep][role] += 1
        self.data_predicate_slot[predicate][slot_class][prep][role] += 1
        if vnclass is not None:
            self.data_vnclass_slot[vnclass][slot_class][prep][role] += 1

    def add_data_bootstrap(self, role, predicate, predicate_classes,
                           slot_class, prep, headword, headword_class):
        """Use one known occurrence of a role in a given context to update
        the data of the bootstrap algorithm

        :param role: The attributed role
        :type role: str
        :param predicate: The predicate of which the slot is an argument
        :type predicate: str
        :param predicate_classes: The VerbNet classes of the predicate
        :type predicate_classes: str List
        :param slot_class: The slot class of the slot we want to resolve
        :type slot_class: str
        :param prep: If the slot is a PP, the preposition that introduced it
        :type prep: str
        :param headword: The headword of the argument
        :type headword: str
        :param headword_class: The WordNet class of the headword
        :type headword_class: str
        """
        if slot_class != ComputeSlotTypeMixin.slot_types["prep_object"]:
            prep = NO_PREP

        # Most specific
        self.data_bootstrap_p[slot_class][prep][predicate][headword][role] += 1

        # First backoff level
        self.data_bootstrap_p1[slot_class][predicate][role] += 1
        self.data_bootstrap_p2[predicate][headword_class][role] += 1
        self.data_bootstrap_p1_sum[slot_class][predicate] += 1
        self.data_bootstrap_p2_sum[predicate][headword_class] += 1

        # For verbs with multiple possible VerbNet classes, the score is
        # uniformly spread among all the classes. A verb without any class
        # contributes nothing here (and must not divide by zero).
        if predicate_classes:
            increment = 1 / len(predicate_classes)
            for vn_class in predicate_classes:
                self.data_bootstrap_p3[slot_class][prep][vn_class][role] += increment
                self.data_bootstrap_p3_sum[slot_class][prep][vn_class] += increment

        # Second backoff level
        self.data_slot_class[slot_class][role] += 1

    def stats_vnclass(self):
        """Print statistics about the distribution of the VerbNet classes of
        each verb in data_vnclass.

        Verbs whose counts sum to zero are ignored. For the verbs that have
        at least two classes, the standard deviation of the class frequencies
        and the highest class frequency are averaged per number of classes.
        The global average line is omitted when no verb has two classes.
        """
        sums = defaultdict(int)
        f_max = defaultdict(int)
        weights = defaultdict(int)
        num_encountered = 0

        for vnclasses in self.data_vnclass.values():
            total = sum(vnclasses.values())
            if total == 0:
                continue
            num_encountered += 1

            n_classes = len(vnclasses)
            if n_classes < 2:
                continue

            freq = [x / total for x in vnclasses.values()]
            # The frequencies sum to one, hence their mean is 1 / n_classes.
            variance = sum((x - (1 / n_classes)) ** 2 for x in freq) / n_classes
            sums[n_classes] += math.sqrt(variance)
            f_max[n_classes] += max(freq)
            weights[n_classes] += 1

        print(
            "{} verbs in VerbNet\n"
            "{} verbs encountered\n".format(
                len(self.data_vnclass), num_encountered))

        for n, sigma in sums.items():
            print("Verbes à {} classes ({} verbes) : std={}, fmax={}".format(
                n, weights[n], sigma / weights[n], f_max[n] / weights[n]))

        # Without any multi-class verb there is nothing to average.
        total_weight = sum(weights.values())
        if total_weight:
            print("Fréquence max moyenne : {}".format(
                sum(f_max.values()) / total_weight))

    def add_data_vnclass(self, matcher):
        """Fill data_vnclass using the data of a framematcher object

        The count is only updated when all the frames of ``matcher.best_data``
        share the same root VerbNet class.

        :param matcher: A frame matcher after at least one matching
        :type matcher: FrameMatcher
        :returns: None | str -- The root VerbNet class that was counted, if
            any
        """
        verb = matcher.frame_occurrence.predicate

        vnclass = None
        for frame, _ in matcher.best_data:
            frame_root = root_vnclass(frame.vnclass)
            if vnclass is None:
                vnclass = frame_root
            elif vnclass != frame_root:
                # Ambiguous match: several root classes are still possible.
                vnclass = None
                break

        if vnclass is not None:
            # Start from zero for a class that was not registered at
            # construction time instead of failing on the missing key.
            class_counts = self.data_vnclass[verb]
            class_counts[vnclass] = class_counts.get(vnclass, 0) + 1

        return vnclass

    def check_vnclass_guess(self, predicate, frame_name, role_matcher):
        """Check the most frequent VerbNet class of a predicate against the
        classes listed for a frame in ``role_matcher.fn_frames``

        :param predicate: The predicate whose classes are looked up
        :type predicate: str
        :param frame_name: The key of the frame in ``role_matcher.fn_frames``
        :type frame_name: str
        :param role_matcher: An object exposing a ``fn_frames`` mapping
        :returns: ``guess_good`` when the class with the highest count is
            listed for the frame, ``guess_bad`` when only another class of
            the predicate is, ``guess_unknown`` otherwise (including when
            nothing is known about the predicate)
        """
        # Read-only lookup: do not register unknown predicates.
        class_data = self.data_vnclass.get(predicate, {})
        if not class_data:
            return ProbabilityModel.guess_unknown

        guess = max(class_data, key=class_data.get)
        frame_data = role_matcher.fn_frames[frame_name]

        if guess in frame_data:
            return ProbabilityModel.guess_good
        if any(vnclass in frame_data for vnclass in class_data):
            return ProbabilityModel.guess_bad
        return ProbabilityModel.guess_unknown

    def best_role(self, role_set, slot_class, prep, predicate, model):
        """Apply one probability model to resolve one slot

        :param role_set: The set of possible roles left by frame matching
        :type role_set: str Set
        :param slot_class: The slot class of the slot we want to resolve
        :type slot_class: str
        :param prep: If the slot is a PP, the preposition that introduced it
        :type prep: str
        :param predicate: The predicate of which the slot is an argument
        :type predicate: str
        :param model: The model that we want to apply
        :type model: str
        :returns: None | str -- The chosen role, or None when the model has
            no data about any role of ``role_set`` (the "default" model
            ignores ``role_set``)
        :raises Exception: When ``model`` is not a known model name
        """
        if slot_class != ComputeSlotTypeMixin.slot_types["prep_object"]:
            final_prep = NO_PREP
        else:
            final_prep = prep

        if model == "default":
            return self.data_default[slot_class]
        elif model == "slot_class":
            data = self.data_slot_class.get(slot_class)
        elif model == "slot":
            data = multi_get(self.data_slot, [slot_class, final_prep])
        elif model == "predicate_slot":
            data = multi_get(self.data_predicate_slot,
                             [predicate, slot_class, final_prep])
        elif model == "vnclass_slot":
            # Read-only lookup: do not register unknown predicates.
            vnclass_counts = self.data_vnclass.get(predicate, {})
            total_vnclass = sum(vnclass_counts.values())
            if total_vnclass == 0:
                return None

            # Mix the role distribution of each class, weighted by the
            # relative count of the class for this predicate.
            data = defaultdict(int)
            for vnclass, n_vnclass in vnclass_counts.items():
                subdata = multi_get(self.data_vnclass_slot,
                                    [vnclass, slot_class, final_prep], {})
                total_role = sum(subdata.values())
                for role, n_role in subdata.items():
                    data[role] += (n_role / total_role) * (n_vnclass / total_vnclass)
        else:
            raise Exception("Unknown model {}".format(model))

        if data:
            # Sorting makes the choice deterministic when several roles tie.
            possible_roles = sorted(set(data.keys()) & role_set)
            if possible_roles:
                return max(possible_roles, key=lambda role: data[role])

        return None

    def best_roles_bootstrap(self, role_set, predicate, predicate_classes, slot_class,
                             prep, headword, headword_class, backoff_level, min_evidence):
        """Computes the two best roles for a slot at a given backoff level
        of the bootstrap algorithm

        :param role_set: The set of possible roles left by frame matching
        :type role_set: str Set
        :param predicate: The predicate of which the slot is an argument
        :type predicate: str
        :param predicate_classes: The VerbNet classes of the predicate
        :type predicate_classes: str List
        :param slot_class: The slot class of the slot we want to resolve
        :type slot_class: str
        :param prep: If the slot is a PP, the preposition that introduced it
        :type prep: str
        :param headword: The headword of the argument
        :type headword: str
        :param headword_class: The WordNet class of the headword
        :type headword_class: str
        :param backoff_level: The backoff level (0, 1 or 2)
        :type backoff_level: int
        :param min_evidence: The minimum number of occurrences that a role
            must have to be returned
        :type min_evidence: int
        :returns: (str, str, float) -- The two roles and their probability
            ratio; ``(None, None, None)`` when no role qualifies and
            ``(role, None, 0)`` when a single role qualifies
        :raises Exception: When ``backoff_level`` is not 0, 1 or 2
        """
        if slot_class != ComputeSlotTypeMixin.slot_types["prep_object"]:
            prep = NO_PREP

        if backoff_level == 0:
            counts = multi_get(self.data_bootstrap_p,
                               [slot_class, prep, predicate, headword], {})
            data = {role: counts[role] for role in counts
                    if role in role_set and counts[role] >= min_evidence}
        elif backoff_level == 1:
            data1 = multi_get(self.data_bootstrap_p1,
                              [slot_class, predicate], {})
            data2 = multi_get(self.data_bootstrap_p2,
                              [predicate, headword_class], {})
            sum1 = multi_get(self.data_bootstrap_p1_sum,
                             [slot_class, predicate], 0)
            sum2 = multi_get(self.data_bootstrap_p2_sum,
                             [predicate, headword_class], 0)

            # We still have the problem of verbs with multiple VN classes.
            # We choose not to give them an equal weight: the weight of each
            # class is proportional to its number of occurrences in the
            # already resolved slots (n is not divided by sum(d.values())).
            data3 = defaultdict(int)
            for vn_class in predicate_classes:
                class_counts = multi_get(self.data_bootstrap_p3,
                                         [slot_class, prep, vn_class], {})
                for role, n in class_counts.items():
                    data3[role] += n
            sum3 = sum(multi_get(self.data_bootstrap_p3_sum,
                                 [slot_class, prep, vn_class], 0)
                       for vn_class in predicate_classes)

            # A role must be known to the three sub-models; the sums are
            # then necessarily non-zero.
            roles = set(data1.keys()) & set(data2.keys()) & set(data3.keys())
            roles = [role for role in roles
                     if role in role_set
                     and data1[role] + data2[role] + data3[role] >= 3 * min_evidence]
            data = {role: (data1[role] / sum1
                           + data2[role] / sum2
                           + data3[role] / sum3)
                    for role in roles}
        elif backoff_level == 2:
            counts = multi_get(self.data_slot_class, [slot_class], {})
            data = {role: counts[role] for role in counts
                    if role in role_set and counts[role] >= min_evidence}
        else:
            raise Exception("Unknown backoff level {}".format(backoff_level))

        # At this point, data is a dictionary that maps every role of
        # role_set that meets the evidence count min_evidence in the model
        # backoff_level to the score of this role in the given conditions
        # according to the model.
        if len(data) == 0:
            return None, None, None

        first = max(data, key=lambda r: data[r])
        if len(data) == 1:
            return first, None, 0

        second = max(data, key=lambda r: 0 if r == first else data[r])
        return first, second, data[first] / data[second]
class ProbabilityModelTest(unittest.TestCase):
    """Unit tests of ProbabilityModel.best_role"""

    def test_1(self):
        """Feed a model step by step and check the role chosen by the
        slot_class, slot and predicate_slot models."""
        model = ProbabilityModel()
        agent_theme = {"Agent", "Theme"}
        agent_location = {"Agent", "Location"}

        # An empty model has nothing to propose
        self.assertIsNone(
            model.best_role(agent_theme, "SUBJ", None, "sleep", "slot_class"))

        # A single observation decides the outcome
        model.add_data("SUBJ", "Theme", "for", "eat")
        self.assertEqual(
            model.best_role(agent_theme, "SUBJ", None, "sleep", "slot_class"),
            "Theme")

        # Agent becomes the most frequent subject role
        for _ in range(3):
            model.add_data("SUBJ", "Agent", "against", "drink")
        self.assertEqual(
            model.best_role(agent_theme, "SUBJ", "without", "sleep", "slot_class"),
            "Agent")

        # Roles that were never observed cannot be chosen
        self.assertIsNone(
            model.best_role({"Patient", "Location"}, "SUBJ", None, "sleep",
                            "slot_class"))

        pp_observations = (
            [("Agent", "with")] * 3
            + [("Location", "in"), ("Location", "to")]
            + [("Destination", "to")] * 2
        )
        for role, prep in pp_observations:
            model.add_data("PPOBJ", role, prep, "eat")

        # The 'slot' model knows nothing about an unseen preposition
        self.assertIsNone(
            model.best_role(agent_location, "PPOBJ", "without", "sleep", "slot"))

        # The 'slot_class' model does not look at the preposition
        self.assertEqual(
            model.best_role(agent_location, "PPOBJ", "to", "sleep", "slot_class"),
            "Agent")

        # With 'to', the 'slot' model prefers 'Location'
        self.assertEqual(
            model.best_role(agent_location, "PPOBJ", "to", "sleep", "slot"),
            "Location")

        # The preposition is irrelevant for a 'SUBJ' slot
        self.assertEqual(
            model.best_role(agent_theme, "SUBJ", "for", "sleep", "slot"),
            "Agent")
        self.assertEqual(
            model.best_role(agent_theme, "SUBJ", "for", "eat", "predicate_slot"),
            "Theme")