# Source code for framematcher

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Match VerbnetFrameOccurrence to appropriate VerbNetOfficialFrame structure

Based on the 2004 Swier & Stevenson paper: Unsupervised Semantic Role Labeling.

The way frame matching works in Swier&Stevenson 2004 paper is not completely
specified: what happens when VerbNet frame is longer than FrameNet's frame?

Currently, we don't extract any slot once a non-match appears: while it can
make sense to omit objects at the end, it doesn't make sense to omit objects
at the beginning and take the ones at the end, since this could result in a
totally different syntactic construction. But I could be wrong!

For example, if you have in FrameNet "NP V with NP", and VerbNet contains "NP V
NP with NP", what do you do? We decided, for now, to only match the syntactic
subject.
"""

import unittest

from verbnetframe import ComputeSlotTypeMixin, VerbnetFrameOccurrence, VerbnetOfficialFrame
from verbnetrestrictions import VNRestriction


class FrameMatcher():
    """Handle frame matching for a given frame that we want to annotate.

    :var frame_occurrence: VerbnetFrameOccurrence -- The frame to annotate
    :var best_score: int -- The best score encountered among all the matches
    :var best_data: (VerbnetOfficialFrame, int List) List -- The frames that achieved this best score + the mapping between the slots of :frame_occurrence and these verbnet frames
    :var best_classes: set -- The ``vnclass`` of every frame that reached the best score in :meth:`new_match`
    :var algo: str -- The algorithm that we want to use ("baseline", "sync_predicates" or "stop_on_fail")

    """

    def __init__(self, frame_occurrence, algo):
        self.frame_occurrence = frame_occurrence
        self.algo = algo

        self.best_score = 0
        self.best_data = []
        self.best_classes = set()

    def handle_semantic_restrictions(self, data):
        """Keep only the frames whose semantic restrictions are the best
        matched by the headwords of the frame occurrence.

        Only :attr:`best_data` is filtered; :attr:`best_classes` is left
        untouched.

        :param data: The gathered relations between restrictions and words
        :type data: (VNRestriction -> (str Counter)) NoHashDefaultDict

        """

        # Nothing to do if no matching has been done yet.
        # Returns early to avoid taking the max of an empty list.
        if not self.best_data:
            return

        scores = [self.frame_semantic_score(x, data) for x in self.best_data]
        # Computed once instead of once per candidate.
        top_score = max(scores)

        self.best_data = [
            frame_data
            for frame_data, score in zip(self.best_data, scores)
            if score == top_score
        ]

    def frame_semantic_score(self, frame_data, semantic_data):
        """For a given frame from VerbNet, compute a semantic score between
        this frame and the headwords of the real frame associated with
        FrameMatcher.

        :param frame_data: The frame and the associated mapping
        :type frame_data: (VerbnetOfficialFrame, int List)
        :param semantic_data: The gathered relations between restrictions and words
        :type semantic_data: (VNRestriction -> (str Counter)) NoHashDefaultDict
        :returns: The sum of ``match_score`` over every mapped slot that has a restriction

        """

        frame, mapping = frame_data
        score = 0
        for slot1, slot2 in enumerate(mapping):
            # Unmapped slots and slots without a restriction do not contribute
            if slot2 is None:
                continue
            if slot2 >= len(frame.role_restrictions):
                continue
            word = self.frame_occurrence.headwords[slot1]
            restr = frame.role_restrictions[slot2]
            score += restr.match_score(word, semantic_data)

        return score

    def get_matched_restrictions(self):
        """Returns the list of restrictions for which we know a given word
        was a match. Only headwords of arguments for which we attributed exactly
        one possible role are taken into account. The restriction associated to
        them is the OR of the restrictions associated to this slot in every
        possible frame.

        :returns: VNRestriction Dict -- a mapping between head words and the restriction they match

        """
        result = {}

        slots = self.possible_distribs()
        for i, slot in enumerate(slots):
            if slot is None or len(slot) != 1:
                continue

            restr = VNRestriction.build_empty()
            for frame, mapping in self.best_data:
                if mapping[i] is None:
                    continue
                if mapping[i] >= len(frame.role_restrictions):
                    continue
                restr = VNRestriction.build_or(
                    restr, frame.role_restrictions[mapping[i]])
            result[self.frame_occurrence.headwords[i]] = restr

        return result

    @staticmethod
    def _is_a_match(frame_occurrence_elem, frame_elem):
        """Tell whether two elements can be considered as a match

        frame_occurrence_elem is a seen element, while frame_elem can contain a
        set of possible elements, such as prepositions

        :returns: bool -- membership test when frame_elem is a set or a
            frozenset, plain equality otherwise
        """

        if isinstance(frame_elem, (set, frozenset)):
            return frame_occurrence_elem in frame_elem
        return frame_occurrence_elem == frame_elem

    @staticmethod
    def _count_slots_before_verb(structure):
        """Count the slot elements of a structure that precede its "V" element."""
        count = 0
        for elem in structure:
            if VerbnetFrameOccurrence._is_a_slot(elem):
                count += 1
            elif elem == "V":
                break
        return count

    def _matching_baseline(self, verbnet_frame, slots_associations):
        """ Matching algorithm that is the closest to the article's method

        Every slot of the frame occurrence is attributed to the first VerbNet
        slot, not attributed yet, that has the same type and, for prepositional
        objects, a compatible preposition.

        :param verbnet_frame: The VerbNet frame to match against
        :param slots_associations: Filled in place with the matched VerbNet slot
            position for every slot of the frame occurrence
        :returns: int -- The number of slots that found a match
        """
        # As slots are not attributed in order, we need to keep a list
        # of the slots that have not been attributed yet
        available_slots = [
            {"slot_type": slot_type, "pos": pos,
             "prep": verbnet_frame.slot_preps[pos]}
            for pos, slot_type in enumerate(verbnet_frame.slot_types)
        ]
        num_match = 0

        for slot_pos, slot_type in enumerate(self.frame_occurrence.slot_types):
            # For every slot, try to find a matching slot in available_slots
            for index, candidate in enumerate(available_slots):
                # We want a slot that has the same type and the same prep
                # (or a set of slot preps containing our preposition)
                if candidate["slot_type"] != slot_type:
                    continue
                if (slot_type == ComputeSlotTypeMixin.slot_types["prep_object"]
                        and not FrameMatcher._is_a_match(
                            self.frame_occurrence.slot_preps[slot_pos],
                            candidate["prep"])):
                    continue

                # Stop at the first good slot we find: it is now attributed.
                # Deleting while iterating is safe because we break right after.
                del available_slots[index]
                # FIXME : we need to check that enough roles were given in VerbNet
                if candidate["pos"] < len(verbnet_frame.roles):
                    slots_associations[slot_pos] = candidate["pos"]
                num_match += 1
                break

        return num_match

    def _matching_sync_predicates(self, verbnet_frame, slots_associations):
        """ Stop the algorithm at the first mismatch encountered after the verb,
        restart at the verb's position if a mismatch is encountered before the
        verb

        :param verbnet_frame: The VerbNet frame to match against
        :param slots_associations: Filled in place with the matched VerbNet slot
            position for every slot of the frame occurrence
        :returns: int -- The number of slots that found a match
        :raises ValueError: from ``.index("V")`` when one of the two structures
            has no "V" element
        """

        occurrence_structure = self.frame_occurrence.structure
        verbnet_structure = verbnet_frame.structure

        num_match = 0
        i, j = 0, 0
        index_v_1 = occurrence_structure.index("V")
        index_v_2 = verbnet_structure.index("V")
        slot_1, slot_2 = 0, 0
        num_slots_before_v_1 = self._count_slots_before_verb(occurrence_structure)
        num_slots_before_v_2 = self._count_slots_before_verb(verbnet_structure)

        while i < len(occurrence_structure) and j < len(verbnet_structure):
            elem1 = occurrence_structure[i]
            elem2 = verbnet_structure[j]

            if FrameMatcher._is_a_match(elem1, elem2):
                if VerbnetFrameOccurrence._is_a_slot(elem1):
                    num_match += 1
                    # verbnet_frame.roles can be too short. This will for instance
                    # happen in the "NP V NP S_INF" structure of want-32.1,
                    # where S_INF is given no role.
                    # The slot cursors only advance when an association is
                    # recorded, so once roles run out no later slot is mapped.
                    if slot_2 < len(verbnet_frame.roles):
                        slots_associations[slot_1] = slot_2
                        slot_1, slot_2 = slot_1 + 1, slot_2 + 1
            elif i < index_v_1 or j < index_v_2:
                # If we have not encountered the verb yet, we continue the matching
                # with everything that follows the verb
                # This is for instance to prevent a "NP NP V" construct
                # from interrupting the matching early
                i, j = index_v_1, index_v_2
                slot_1, slot_2 = num_slots_before_v_1, num_slots_before_v_2
            else:
                break

            i, j = i + 1, j + 1
        return num_match

    def _matching_stop_on_fail(self, verbnet_frame, slots_associations):
        """ Stop the algorithm at the first mismatch encountered

        :param verbnet_frame: The VerbNet frame to match against
        :param slots_associations: Filled in place; the n-th matched slot is
            mapped to the n-th VerbNet slot while VerbNet roles are available
        :returns: int -- The number of slots that found a match
        """
        num_match = 0
        for elem1, elem2 in zip(self.frame_occurrence.structure,
                                verbnet_frame.structure):
            if not FrameMatcher._is_a_match(elem1, elem2):
                break
            if VerbnetFrameOccurrence._is_a_slot(elem1):
                slot_index = num_match
                num_match += 1
                if slot_index < len(verbnet_frame.roles):
                    slots_associations[slot_index] = slot_index

        return num_match

    def new_match(self, verbnet_frame):
        """Compute the matching score and update the possible roles distribs

        The score is ``int(100 * (ratio_1 + ratio_2))`` where each ratio is the
        number of matched slots divided by the number of slots of,
        respectively, the frame occurrence and the VerbNet frame. A side
        without any slot gets a ratio of 1.

        :param verbnet_frame: frame to test.
        :type verbnet_frame: VerbnetOfficialFrame.
        :raises ValueError: if :attr:`algo` is not a known algorithm name.

        """
        occurrence_num_slots = self.frame_occurrence.num_slots
        slots_associations = [None] * occurrence_num_slots

        if self.algo == "baseline":
            matching_function = self._matching_baseline
        elif self.algo == "sync_predicates":
            matching_function = self._matching_sync_predicates
        elif self.algo == "stop_on_fail":
            matching_function = self._matching_stop_on_fail
        else:
            # ValueError is still caught by callers catching Exception
            raise ValueError(
                "Unknown matching algorithm : {}".format(self.algo))

        num_match = matching_function(verbnet_frame, slots_associations)

        # Score computation. Both ratios are guarded the same way so that a
        # slot-less frame occurrence no longer raises ZeroDivisionError.
        if occurrence_num_slots == 0:
            ratio_1 = 1
        else:
            ratio_1 = num_match / occurrence_num_slots
        if verbnet_frame.num_slots == 0:
            ratio_2 = 1
        else:
            ratio_2 = num_match / verbnet_frame.num_slots
        score = int(100 * (ratio_1 + ratio_2))

        if score > self.best_score:
            # This frame is better than any previous one : reset everything
            self.best_data = []
            self.best_classes = set()
        if score >= self.best_score:
            self.best_score = score
            # This frame got the best score : add its data
            self.best_data.append((verbnet_frame, slots_associations))
            self.best_classes.add(verbnet_frame.vnclass)

    def possible_distribs(self):
        """Compute the lists of possible roles for each slots

        For every best frame and every mapped slot, one element of the
        matching entry of ``frame.roles`` is added to the slot's set.

        :returns: str set list -- The lists of possible roles for each slot
        """

        result = [set() for _ in range(self.frame_occurrence.num_slots)]

        for frame, mapping in self.best_data:
            for slot1, slot2 in enumerate(mapping):
                if slot2 is None:
                    continue
                role = next(iter(frame.roles[slot2]))
                result[slot1].add(role)

        return result
        
class frameMatcherTest(unittest.TestCase):
    """Unit tests for the FrameMatcher matching algorithms."""

    def test_1(self):
        """A partially matching frame is superseded by two perfect matches,
        whose roles are merged slot by slot."""
        occurrence = VerbnetFrameOccurrence(
            ["NP", "V", "NP", "with", "NP"], [None, None, None], "a predicate")
        for_frame = VerbnetOfficialFrame(
            ["NP", "V", "NP", "for", "NP"], ["Agent", "Patient", "Role1"], "a", [])
        with_frame_b = VerbnetOfficialFrame(
            ["NP", "V", "NP", "with", "NP"], ["Agent", "Patient", "Role2"], "b", [])
        with_frame_c = VerbnetOfficialFrame(
            ["NP", "V", "NP", "with", "NP"], ["Agent", "Patient", "Role3"], "c", [])

        matcher = FrameMatcher(occurrence, "sync_predicates")

        # Only the first two slots match the "for" frame
        matcher.new_match(for_frame)
        self.assertEqual(matcher.best_score, int(100 * 4 / 3))

        # Both "with" frames match every slot
        for perfect_frame in (with_frame_b, with_frame_c):
            matcher.new_match(perfect_frame)
        self.assertEqual(matcher.best_score, 200)

        expected_roles = [{"Agent"}, {"Patient"}, {"Role2", "Role3"}]
        self.assertEqual(matcher.possible_distribs(), expected_roles)

    def test_2(self):
        """A frame occurrence without any slot element reports zero slots."""
        frame_occurrence = VerbnetFrameOccurrence(["to", "be"], [], "a predicate")
        frame = VerbnetOfficialFrame(
            ["NP", "V", "NP", "with", "NP"], ["Agent", "Patient", "Role3"], "X", [])

        self.assertEqual(frame_occurrence.num_slots, 0)

    def test_3(self):
        """With sync_predicates, only the subject of "NP V with NP" matches
        the longer "NP V NP with NP" frame."""
        occurrence = VerbnetFrameOccurrence(
            ["NP", "V", "with", "NP"], [None, None], "a predicate")
        longer_frame = VerbnetOfficialFrame(
            ["NP", "V", "NP", "with", "NP"], ["Agent", "Patient", "Role3"], "XX", [])

        matcher = FrameMatcher(occurrence, "sync_predicates")
        matcher.new_match(longer_frame)

        expected_score = int(100 / 2 + 100 / 3)
        self.assertEqual(matcher.best_score, expected_score)

    def test_4(self):
        """Roles of all the best-scoring frames are accumulated per slot."""
        occurrence = VerbnetFrameOccurrence(
            ['NP', 'V', 'NP'], [None, None], "a predicate")
        matcher = FrameMatcher(occurrence, "sync_predicates")

        candidates = [
            VerbnetOfficialFrame(['NP', 'V', 'NP'], ['Agent', 'Theme'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', 'NP'], ['Agent', 'Theme'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V'], ['Theme'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', 'NP'], ['Agent', 'Theme'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', {'with'}, 'NP'], ['Theme', 'Instrument'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', 'NP', {'with'}, 'NP'], ['Agent', 'Theme', 'Instrument'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', 'NP'], ['Instrument', 'Theme'], "XX", []),
        ]
        for candidate in candidates:
            matcher.new_match(candidate)

        expected_roles = [{"Agent", "Instrument"}, {"Theme"}]
        self.assertEqual(matcher.possible_distribs(), expected_roles)

    def test_baseline_alg(self):
        """The baseline algorithm attributes slots out of order and accepts a
        preposition contained in a set of prepositions."""
        occurrence = VerbnetFrameOccurrence(
            ['NP', 'V', 'NP', 'NP', 'for', 'NP'],
            [None, None, None, None],
            "a predicate")

        candidates = [
            VerbnetOfficialFrame(['NP', 'V', 'NP', 'by', 'NP'], ['R1', 'R2', 'R3'], "XX", []),
            VerbnetOfficialFrame(['NP', 'V', 'NP', {'for', 'as'}, 'NP'], ['R1', 'R4', 'R5'], "XX", []),
        ]

        matcher = FrameMatcher(occurrence, "baseline")
        for candidate in candidates:
            matcher.new_match(candidate)

        expected_roles = [{"R1"}, {"R4"}, set(), {"R5"}]
        self.assertEqual(matcher.possible_distribs(), expected_roles)