Source code for verbnetframe

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import unittest
from abc import ABCMeta

from framestructure import FrameInstance, Predicate, Arg, Word
import verbnetprepclasses

[docs]class ComputeSlotTypeMixin(metaclass=ABCMeta): slot_types = { "subject": "SBJ", "object": "OBJ", "indirect_object": "OBJI", "prep_object": "PPOBJ" }
[docs] def compute_slot_types(self, structure): """Build the list of slot types for this frame""" slot_types, slot_preps = [], [] # The next slot we are expecting : # always subject before the verb, object immediatly after the verb # and indirect_object after we encoutered a slot for object next_expected = ComputeSlotTypeMixin.slot_types["subject"] # If last structure element was a preposition, this will be filled # with the preposition and will "overwrite" :next_expected preposition = "" for element in structure: if element == "V": next_expected = ComputeSlotTypeMixin.slot_types["object"] elif self._is_a_slot(element): if preposition != "": slot_types.append(ComputeSlotTypeMixin.slot_types["prep_object"]) slot_preps.append(preposition) preposition = "" else: slot_types.append(next_expected) slot_preps.append(None) if next_expected == ComputeSlotTypeMixin.slot_types["object"]: next_expected = ComputeSlotTypeMixin.slot_types["indirect_object"] elif isinstance(element, set) or element in verbnetprepclasses.all_preps: preposition = element return slot_types, slot_preps
@staticmethod
[docs] def _is_a_slot(elem): """Tell wether an element represent a slot :param elem: The element. :type elem: str. :returns: bool -- True if elem represents a slot, False otherwise """ return isinstance(elem, str) and elem[0].isupper() and elem != "V"
[docs]class VerbnetFrameOccurrence(ComputeSlotTypeMixin): """A representation of a FrameNet frame occurrence converted to VerbNet representation for easy comparison. :var structure: (str | str set) list -- representation of the structure :var roles: set list -- possible VerbNet roles for each structure's slot :var num_slots: int -- number of argument slots in :structure :var predicate: str -- the predicate :var headwords: str -- the head word of each argument """ phrase_replacements = { "N": "NP", "Poss": "NP", "QUO": "S", "Sinterrog": "S", "Sfin": "S", "VPbrst": "S", "VPing": "S_ING", "VPto": "to S" } def __init__(self, structure, roles, predicate): self.structure = structure self.predicate = predicate # Transform "a" in {"a"} and keep everything else unchanged self.roles = [{x} if isinstance(x, str) else x for x in roles] self.num_slots = len(self.roles) self.slot_types, self.slot_preps = self.compute_slot_types(structure) self.headwords = [] self.best_classes = None def __eq__(self, other): return (isinstance(other, self.__class__) and self.structure == other.structure and self.roles == other.roles and self.num_slots == other.num_slots and self.predicate == other.predicate) def __repr__(self): return "VerbnetFrameOccurrence({}, {}, {})".format( self.predicate, self.structure, self.roles) @staticmethod
[docs] def build_from_frame(gold_framenet_instance, conll_frame_instance): """Build a VerbNet frame from a FrameInstance object :param gold_framenet_instance: The gold FrameNet frame instance :type frame: FrameInstance. :param conll_frame_instance: The frame instance from CoNLL :type frame: FrameInstance. :returns: VerbnetFrameOccurrence -- the frame without the gold roles converted to VerbNet-style representation """ num_slots = 0 # The goal here is to translate a FrameInstance into a VerbnetFrameOccurrence. # We do this in a number of steps # TODO split the method for two usages if conll_frame_instance is not None: sentence = conll_frame_instance.sentence elif gold_framenet_instance is not None: sentence = gold_framenet_instance.sentence else: raise Exception('Either conll_frame_instance or gold_framenet_instance should exist.') # First, only keep the text segments with arguments and predicates begin = gold_framenet_instance.predicate.begin end = gold_framenet_instance.predicate.end for argument in gold_framenet_instance.args: if not argument.instanciated: continue num_slots += 1 if argument.begin < begin: begin = argument.begin if argument.end > end: end = argument.end structure = sentence[begin:end + 1] # Then, replace the predicate/arguments by their phrase type structure = VerbnetFrameOccurrence._reduce_args(gold_framenet_instance, structure, begin) # And delete everything else, except some keywords structure = VerbnetFrameOccurrence._keep_only_keywords(structure) # Transform the structure into a list structure = structure.split(" ") result = VerbnetFrameOccurrence(structure, [], predicate=gold_framenet_instance.predicate.lemma) result.num_slots = num_slots # Finally, fill the role list with None value result.roles = [None] * num_slots # If the FrameInstance only comes from a CoNLL file and is not part of # the corpus, we don't want to loose predicate/args position in the # file so that we can add classes and roles later # TODO remove this condition and reorganize caller code instead if conll_frame_instance is not None: result.predicate_position = conll_frame_instance.predicate.position result.args = conll_frame_instance.args result.sentence_id = conll_frame_instance.sentence_id return result
@staticmethod
[docs] def _reduce_args(frame, structure, new_begin): """Replace the predicate and the argument of a frame by phrase type marks :param frame: The original Frame. :type frame: Frame. :param structure: The current structure representation. :type structure: str. :param new_begin: The left offset cause by previous manipulations. :type new_begin: int. :returns: String -- the reduced string """ predicate_begin = frame.predicate.begin - new_begin predicate_end = frame.predicate.end - new_begin for argument in reversed(frame.args): if not argument.instanciated: continue phrase_type = argument.phrase_type if phrase_type in VerbnetFrameOccurrence.phrase_replacements: phrase_type = VerbnetFrameOccurrence.phrase_replacements[phrase_type] before = structure[0:argument.begin - new_begin] after = structure[1 + argument.end - new_begin:] arg_first_word = argument.text.lower().split(" ")[0] # Fix some S incorrectly marked as PP if (phrase_type == "PP" and arg_first_word in verbnetprepclasses.sub_pronouns): added_length = 8 + len(arg_first_word) structure = "{} || {} S| {}".format(before, arg_first_word, after) # Replace every "PP" by "prep NP" elif phrase_type == "PP": prep = "" for word in argument.text.lower().split(" "): if word in verbnetprepclasses.keywords: prep = word break if prep == "": prep = arg_first_word added_length = 9 + len(prep) structure = "{} || {} NP| {}".format(before, prep, after) # Replace every "PPing" by "prep S_ING", elif phrase_type == "PPing": prep = "" for word in argument.text.lower().split(" "): if word in verbnetprepclasses.keywords: prep = word break if prep == "": prep = arg_first_word added_length = 12 + len(prep) structure = "{} || {} S_ING| {}".format(before, prep, after) # Replace every "Swhether" and "S" by "that S", "if S", ... elif phrase_type in ["Swhether", "Sub"]: added_length = 8 + len(arg_first_word) structure = "{} || {} S| {}".format(before, arg_first_word, after) else: added_length = 6 + len(phrase_type) structure = "{} || {}| {}".format(before, phrase_type, after) # Compute the new position of the predicate if we reduced an # argument before it if argument.begin - new_begin < predicate_begin: offset = (argument.end - argument.begin + 1) - added_length predicate_begin -= offset predicate_end -= offset structure = "{} || V| {}".format( structure[0:predicate_begin], structure[1+predicate_end:]) return structure
@staticmethod
[docs] def _keep_only_keywords(sentence): """Keep only keywords and phrase type markers in the structure :param sentence: The structure to reduce. :type sentence: str. :returns: String -- the reduced string """ pos = 0 last_pos = len(sentence) - 1 inside_tag = 0 closing_tag = False result = "" while pos < last_pos: if inside_tag == 2 and sentence[pos] == "|": inside_tag = 0 closing_tag = True if inside_tag == 2: result += sentence[pos] pos += 1 continue if not closing_tag and sentence[pos] == "|": inside_tag += 1 else: inside_tag = 0 closing_tag = False for search in verbnetprepclasses.external_lexemes: if (search == sentence[pos:pos + len(search)].lower() and (pos == 0 or sentence[pos - 1] == " ") and (pos + len(search) == len(sentence) or sentence[pos + len(search)] == " ")): pos += len(search) - 1 result += " "+search pos += 1 if result[0] == " ": result = result[1:] if result[-1] == " ": result = result[:-1] return result
[docs]class VerbnetOfficialFrame(ComputeSlotTypeMixin): """A representation of a frame syntactic structure :var structure: (str | str set) List -- representation of the structure :var roles: str list -- VerbNet roles for each structure's slot :var num_slots: int -- number of argument slots in :structure :var vnclass: str -- the class number, eg. 9.10 :var example: str -- An example sentence that illustrates the frame """ def __init__(self, structure, roles, vnclass, role_restrictions): self.structure = structure # Transform "a" in {"a"} and keep everything else unchanged self.roles = [{x} if isinstance(x, str) else x for x in roles] self.num_slots = len(self.roles) self.role_restrictions = role_restrictions self.slot_types, self.slot_preps = self.compute_slot_types(structure) self.vnclass = vnclass def __eq__(self, other): return (isinstance(other, self.__class__) and self.structure == other.structure and self.roles == other.roles and self.num_slots == other.num_slots and self.vnclass == other.vnclass) def __repr__(self): return "VerbnetOfficialFrame({}, {}, {})".format( self.vnclass, self.structure, self.roles)
[docs] def passivize(self): """ Based on current frame, return a list of possible passivizations """ passivizedframes = [] # Find the position of the first slot following the verb and # the last element of the first slot of the frame slot_position = 0 old_sbj_end = 0 first_slot = True for i, element in enumerate(self.structure): if first_slot: old_sbj_end = i if VerbnetOfficialFrame._is_a_slot(element): first_slot = False slot_position += 1 if element == "V": break # Find the first and last element of the first slot following the verb index_v = self.structure.index("V") new_sbj_begin, new_sbj_end = index_v + 1, index_v + 1 while True: if new_sbj_end >= len(self.structure): return [] if VerbnetOfficialFrame._is_a_slot(self.structure[new_sbj_end]): break new_sbj_end += 1 # Build the passive frame without "by" frame_without_agent = VerbnetOfficialFrame( (self.structure[new_sbj_begin:new_sbj_end+1] + self.structure[old_sbj_end+1:index_v] + ["V"] + self.structure[new_sbj_end+1:]), ([self.roles[slot_position]] + self.roles[1:slot_position] + self.roles[slot_position+1:]), vnclass=self.vnclass, role_restrictions=self.role_restrictions ) passivizedframes.append(frame_without_agent) # Add the frames obtained by inserting "by + the old subject" # after the verb and every slot that follows it new_index_v = frame_without_agent.structure.index("V") i = new_index_v slot = slot_position - 1 while i < len(frame_without_agent.structure): elem = frame_without_agent.structure[i] if self._is_a_slot(elem) or elem == "V": passivizedframes.append(VerbnetOfficialFrame( (frame_without_agent.structure[0:i+1] + ["by"] + self.structure[0:old_sbj_end+1] + frame_without_agent.structure[i+1:]), (frame_without_agent.roles[0:slot+1] + [self.roles[0]] + frame_without_agent.roles[slot+1:]), vnclass=self.vnclass, role_restrictions=self.role_restrictions )) slot += 1 i += 1 return passivizedframes
[docs] def generate_relatives(self): relatives = [] i_slot = 0 for i, element in enumerate(self.structure): if VerbnetOfficialFrame._is_a_slot(element): j = i - 1 while j >= 0 and self.structure[j][0].islower(): j -= 1 structure = (self.structure[j+1:i+1] + self.structure[0:j+1] + self.structure[i+1:]) roles = ([self.roles[i_slot]] + self.roles[0:i_slot] + self.roles[i_slot+1:]) relatives.append( VerbnetOfficialFrame(structure, roles, vnclass=self.vnclass, role_restrictions=self.role_restrictions)) i_slot += 1 return relatives