# Source code for verbnetreader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Read VerbNet and build a list of allowed VerbNet frame for each verb"""

import unittest
import xml.etree.ElementTree as ET
import os
import sys

from errorslog import errors
from verbnetframe import VerbnetOfficialFrame
from verbnetrestrictions import VNRestriction
import verbnetprepclasses
import paths


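class VerbnetReader:

    """Class used to parse VerbNet and build its representation in memory.

    :var frames_for_verb: Dictionary mapping each verb to the list of
        VerbnetOfficialFrame of its classes (inherited frames included).
    :var classes: Dictionary mapping each verb to the list of VerbNet class
        IDs it is a member of.
    :var roles: Dictionary mapping each VerbNet class ID to its list of
        thematic role names (inherited roles included).
    :var cnames: Dictionary mapping each VerbNet class ID to the name of the
        XML file it was read from.
    :var unhandled: List of dictionaries ("file", "elem", "data") describing
        the VerbNet constructs the parser could not handle.
    """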
    
    def __init__(self, path, normalize = False):
        """Read VerbNet and fill verbs with its content.
        
        :param path: Path to VerbNet.
        :type path: str.
        :param normalize: Either stick to VerbNet content closely or make it
            easier for the frame matching to proceed.
        :type normalize: boolean.
        
        """
        
        self.normalize = normalize
        
        self.frames_for_verb = {}
        self.classes = {}
        self.roles = {}
        self.cnames = {}

        # Debug data
        self.filename = ""
        self.unhandled = []

        for filename in os.listdir(path):
            if not filename[-4:] == ".xml": continue

            self.filename = filename
            root = ET.ElementTree(file=path+self.filename)
            self._handle_class(root.getroot(), [], [], [])

        if self.normalize:
            return self._normalized()
    
[docs]    def _normalized(self):
        self.files = {}
        
        for verb, verb_data in self.frames_for_verb.items():
            for vnframe in verb_data:
                filename = self.cnames[vnframe.vnclass][:-4]
                
                if not filename in self.files: self.files[filename] = {
                    "children": [], "roles": set(), "members": [],
                    "frames": [], "name":vnframe.vnclass.split("-")[0]}
                    
                current_class = self.files[filename]
                if "-" in vnframe.vnclass:
                    for subclass in vnframe.vnclass.split("-")[1:]:
                        new_name = current_class["name"]+"-"+subclass
                        
                        matching_class = None
                        for child_class in current_class["children"]:
                            if child_class["name"] == new_name:
                                matching_class = child_class
                        if matching_class == None:
                            new_class = {
                                "children": [], "roles": set(), "members": [],
                                "frames": [], "name":new_name}
                            current_class["children"].append(new_class)
                            matching_class = new_class
                            
                        current_class = matching_class
                
                current_class["members"].append(verb)
                current_class["frames"].append(vnframe)
                for role in vnframe.roles:
                    current_class["roles"].add(next(role.__iter__()))
    
[docs]    def _handle_class(self, xml_class, parent_frames, role_list, restrictions):
        """Parse one class of verbs and all its subclasses.
        
        :param xml_class: XML representation of the class of verbs.
        :type xml_class: xml.etree.ElementTree.Element.
        :param parent_frames: the frame inherited from the parent class.
        :type parent_frames: VerbnetOfficialFrame list.
        
        """
        frames = parent_frames[:]
        role_list = role_list[:]
        restrictions = restrictions[:]
        
        vnclass = xml_class.attrib["ID"]
        self.cnames[vnclass] = self.filename
        
        for xml_role in xml_class.find("THEMROLES"):
            role_list.append(xml_role.attrib["type"])
            restrictions.append(
                VNRestriction.build_from_xml(xml_role.find("SELRESTRS")))
        
        self.roles[vnclass] = role_list

        for xml_frame in xml_class.find("FRAMES"):
            frames += self._build_frame(xml_frame, vnclass, role_list, restrictions)
        
        for xml_verb in xml_class.find("MEMBERS"):
            verb = xml_verb.attrib["name"]
            if not verb in self.frames_for_verb:
                self.frames_for_verb[verb] = []
                self.classes[verb] = []
                
            self.frames_for_verb[verb] += frames
            self.classes[verb].append(vnclass)
            
        for subclass in xml_class.find("SUBCLASSES"):
            self._handle_class(subclass, frames, role_list, restrictions)
       
[docs]    def _build_frame(self, xml_frame, vnclass, role_list, restrictions):
        """Parse one frame
        
        :param xml_frame: XML representation of the frame.
        :type xml_frame: xml.etree.ElementTree.Element.
        :param vnclass: The VerbNet class to which the frame belongs.
        :type vnclass: str.
        
        """
        # Extract the structure
        base_structure = xml_frame.find("DESCRIPTION").attrib["primary"]
        # Transform it into a list
        #base_structure = [x.split(".")[0] for x in base_structure.split(" ")]
        base_structure = base_structure.split(" ")
        
        # Lexeme at the beginning of a structure are capitalized.
        # We need to them to be completely lowercase to match them against syntax item.
        element = base_structure[0]
        if element[0].isupper() and element.split(".")[0].upper() != element.split(".")[0]:
            base_structure[0] = element.lower()
            
        syntax_data = xml_frame.find("SYNTAX")
        
        roles, structures = self._build_structure(
            base_structure, syntax_data, vnclass, role_list)

        role_restr = [[restrictions[role_list.index(x)] for x in y] for y in roles]
        
        result = [VerbnetOfficialFrame(y, x, vnclass, role_restrictions=z)
            for x, y, z in zip(roles, structures, role_restr)]
        
        if self.normalize:
            example = xml_frame.find("EXAMPLES/EXAMPLE").text
            semantics = self._build_semantics(xml_frame.find("SEMANTICS"))
            syntax_roles = self._format_syntax_roles(xml_frame.find("SYNTAX"))
            
            for frame in result:
                frame.example = example
                frame.semantics = semantics
                frame.syntax = syntax_roles
        
        return result
  
[docs]    def _build_structure(self, base_structure, syntax_data, vnclass, role_list):
        """ Build the final structure from base_structure
        
        :param base_structure: The base structure
        :type base_structure: str List
        :param syntax_data: The XML "SYNTAX" node
        :type syntax_data: xml.etree.ElementTree.Element
        :param vnclass: The VerbNet class of the frame
        :type vnclass: str
        :returns: (str | str List) List -- the final structure
        
        """
        structure = []
        roles = []

        index_xml = -1
        num_slot = 0

        replacements = {
            "ADVP-Middle":[], "ADV-Middle":[],
            "NP-Fulfilling":["NP"], "NP-Dative":["NP"],
            "S-Quote":["S"], "S_INF":["to", "S"]
        }

        previous_was_pp = False

        for i, full_element in enumerate(base_structure):
            full_element = full_element.split(".")
            element = full_element[0]

            # see snooze-40.4 for instance (intransitive verbs)
            # We cannot use :replacements because lower/upper case
            # is used to detect keywords
            if element == "v": element = "V"

            # Handle "PP S_ING": we must ignore the PP
            if element == "S_ING" and previous_was_pp:
                del roles[-1]
                del structure[-1]
            previous_was_pp = (element == "PP")
            
            # Make "that" optionnal
            if element == "that": element = "(#that)"
            if element == "#that": element = "that"
            
            # Handle optionnal elements
            if len(element) > 0 and element[0] == "(":
                base_structure_1 = base_structure[:]
                del base_structure_1[i]
                base_structure_2 = base_structure[:]
                base_structure_2[i] = element[1:-1]

                roles1, structure1 = self._build_structure(
                    base_structure_1, syntax_data, vnclass, role_list)
                roles2, structure2 = self._build_structure(
                    base_structure_2, syntax_data, vnclass, role_list)
                return (roles1 + roles2), (structure1 + structure2)
            
            # Handle some syntax issues : see last entry of steal-10.5
            if element == "" or "\n" in element:
                continue
            # Handle simple replacements
            if element in replacements:
                structure = structure + replacements[element]
            # Handle the "a/b" syntax (which means "a" or "b")
            elif "/" in element:
                structure.append(set(element.split("/")))
            # Replace PP by "{preposition set} + NP"
            elif element == "PP":
                new_index, prep = self._read_syntax_data(
                    index_xml, syntax_data, "keyword", base_structure)
                if new_index == -1:
                    self.unhandled.append({
                        "file":self.filename,
                        "elem":"PP",
                        "data":"No syntax data found"
                    })
                    if len(full_element) > 1 and full_element[1] == "location":
                        structure += [verbnetprepclasses.prep["loc"], "NP"]
                    else:
                        structure += [verbnetprepclasses.all_preps, "NP"]
                else:
                    index_xml = new_index
                    structure += [prep, "NP"]
            # Everything else (NP, V, ...) is unmodified
            else:
                structure.append(element)

            search = element
            if search[0].islower(): search = "keyword"
            
            # Look for a matching element in SYNTAX
            # and check whether we can find an unexpected keyword to add,
            # between our current position and the matching element
            new_index, keyword = self._read_syntax_data(
                index_xml, syntax_data, search, base_structure)
            if keyword != "" and search != "keyword":
                structure.insert(-1, keyword)
            if new_index != -1:
                index_xml = new_index
            
            if VerbnetOfficialFrame._is_a_slot(element): roles.append(None)

            if len(full_element) > 1:
                potential_role = "-".join([x.title() for x in full_element[1].split('-')])
                if potential_role in role_list:
                    roles[num_slot - 1] = potential_role

        # Fill the role list
        i = 0
        for element in syntax_data:
            if ((not element.tag in ["VERB", "PREP", "LEX"]) and
                "value" in element.attrib
            ):
                if i >= len(roles):
                    roles.append(None)
                    self.unhandled.append({
                        "file":self.filename,
                        "elem":"\\",
                        "data":"Too many roles in the syntax"
                    })
                else:
                    if roles[i] != None and roles[i] != element.attrib["value"]:
                        self.unhandled.append({
                        "file":self.filename,
                        "elem":"\\",
                        "data":"Conflict between roles indicated in syntax and structure"
                        })
                    else:
                        roles[i] = element.attrib["value"]
                i += 1
             
        while len(roles) > 0 and roles[-1] == None: del roles[-1]
            
        return [roles], [structure]
    
[docs]    def _read_syntax_data(self, index_xml, syntax_data, elem, base_structure):
        """ Look for a node of SYNTAX that match the current element
        and tells whether a keyword was found between the old and new position
        
        :param index_xml: The current position
        :type index_ml: int
        :param syntax_data: The XML "SYNTAX" node
        :type syntax_data: xml.etree.ElementTree.Element
        :param elem: The element to look for (VerbNet syntax)
        :type elem: str
        :param base_structure: The frame base structure (for _handle_lex)
        :type base_structure: str List
        :returns: (int, str) -- the new position and a keyword if one is found
        
        """
        special_tags = {"V":["VERB"], "keyword":["PREP", "LEX"]}
        stop_tags = ["NP", "V"]
        
        expected_tags = ["NP"]
        if len(elem) >= 3 and elem[0:3] == "ADV": expected_tags = ["ADV"]
        if len(elem) >= 3 and elem[0:3] == "ADJ": expected_tags = ["ADJ", "NP"]
        if elem in special_tags: expected_tags = special_tags[elem]
        
        found = False
        keyword = ""
        index_xml += 1
        
        while index_xml < len(syntax_data):
            if syntax_data[index_xml].tag == "PREP":
                keyword = self._handle_prep(syntax_data[index_xml])
            if syntax_data[index_xml].tag == "LEX":
                keyword = self._handle_lex(syntax_data[index_xml], base_structure)
                
            if syntax_data[index_xml].tag in expected_tags:
                found = True
                break
            if syntax_data[index_xml].tag in stop_tags and elem != "V":
                break
            index_xml += 1
            
        if not found:
            return -1, ""
            
        return index_xml, keyword
        
[docs]    def _handle_lex(self, xml, base_structure):
        """Choose wether or not to keep a <LEX> entry
        
        :param xml: The <LEX> entry.
        :type xml:xml.etree.ElementTree.Element.
        :param base_structure: The VerbNet primary structure.
        :type base_structure: str List.
        :returns: String the lexeme value if accepted, "" otherwise

        """
        
        # The lexeme is already mentionned in the primary structure
        # We don't want to add it a second time
        if xml.attrib["value"] in base_structure:
            return ""
        
        #for group in verbnetprepclasses.keywords:
        if xml.attrib["value"] in verbnetprepclasses.keywords:
            return xml.attrib["value"]

        self.unhandled.append({
            "file":self.filename,
            "elem":"LEX",
            "data":"Unhandled lexeme : {}".format(xml.attrib["value"])
        })
        
        return ""
                            
[docs]    def _handle_prep(self, xml):
        """Generate the list of acceptable preposition from a <PREP> entry
        
        :param xml: The <PREP> entry.
        :type xml:xml.etree.ElementTree.Element.
        :returns: String List - the list of acceptable prepositions

        """
        for restr_group in xml:
            if restr_group.tag == "SELRESTRS":
                for restr in restr_group:
                    if (restr.attrib["Value"] == "+"
                        and restr.attrib["type"] in verbnetprepclasses.prep
                    ):
                        return verbnetprepclasses.prep[restr.attrib["type"]]
                    else:
                        self.unhandled.append({
                            "file":self.filename,
                            "elem":"PREP",
                            "data":"SELRESTR {}={}".format(
                                restr.attrib["type"], restr.attrib["Value"])
                        })
            else:
                self.unhandled.append({
                    "file":self.filename,
                    "elem":"PREP",
                    "data":"Unknown restriction : {}".format(restr_group.tag)
                })
        if "value" in xml.attrib:
            return set(xml.attrib["value"].split(" "))
        else:
            return ""
    
[docs]    def _format_syntax_roles(self, xml_syntax):
        result = []
        for node in xml_syntax:
            if node.tag == "NP":
                result.append(node.get("value"))
            elif node.tag == "VERB":
                result.append("V")
            elif node.tag == "LEX":
                result.append(node.get("value"))
            elif node.tag == "PREP":
                if node.get("value"):
                    result.append("{{{}}}".format(node.get("value")))
                else:
                    restr = node.find("SELRESTRS/SELRESTR")
                    result.append("{{{{{}{}}}}}".format(restr.get("Value"), restr.get("type")))

            if node.find("SYNRESTRS"):
                restr = node.find("SYNRESTRS/SYNRESTR")
                result.append("<{}{}>".format(restr.get("Value"), restr.get("type")))

        return " ".join(result)

[docs]    def _build_semantics(self, xml_semantics):
        pred_strings = []
        for pred in xml_semantics.findall("PRED"):
            pred_string = "{}({})".format(
                pred.get("value"),
                ", ".join([arg.get("value") for arg in pred.findall("ARGS/ARG")])
            )
            if pred.get("bool") == "!":
                pred_string = "not({})".format(pred_string)

            pred_strings.append(pred_string)

        return " ".join(pred_strings)
        
class VerbnetReaderTest(unittest.TestCase):

    """Unit test class"""

    def test_global(self):
        """Check the whole of VerbNet, then the separate-23.1 class in detail.

        Needs the VerbNet XML files under paths.VERBNET_PATH.
        """
        reader = VerbnetReader(paths.VERBNET_PATH)
        self.assertEqual(len(reader.frames_for_verb), 4154)

        # (verb, one frame the verb must have, restrictions of its roles)
        cases = [
            ("sparkle",
             VerbnetOfficialFrame(
                ['there', 'V', 'NP', verbnetprepclasses.prep["loc"], 'NP'],
                ['Theme', 'Location'],
                "light_emission-43.1", []),
             ["(NOT animate)", "NORESTR"]),
            ("employ",
             VerbnetOfficialFrame(
                ["NP", "V", "NP", "ADV"],
                ["Agent", "Theme"],
                "use-105", []),
             ["(animate) OR (organization)", "NORESTR"]),
            ("break",
             VerbnetOfficialFrame(
                ["NP", "V"],
                ["Patient"],
                "break-45.1", []),
             ["solid"]),
            ("suggest",
             VerbnetOfficialFrame(
                ["NP", "V", "how", "to", "S"],
                ["Agent", "Topic"],
                "say-37.7", []),
             ["(animate) OR (organization)", "communication"]),
            ("snooze",
             VerbnetOfficialFrame(
                ["NP", "V"],
                ["Agent"],
                "snooze-40.4", []),
             ["animate"]),
        ]

        for verb, frame, expected_restrictions in cases:
            self.assertIn(verb, reader.frames_for_verb)
            verb_frames = reader.frames_for_verb[verb]
            self.assertIn(frame, verb_frames)
            # Fetch the parsed frame to look at its restrictions
            vnframe = verb_frames[verb_frames.index(frame)]
            self.assertEqual(
                [str(x) for x in vnframe.role_restrictions],
                expected_restrictions)

        # Parse one single class again, starting from an empty dictionary
        reader.frames_for_verb = {}
        root = ET.ElementTree(file=paths.VERBNET_PATH + "separate-23.1.xml")
        reader._handle_class(root.getroot(), [], [], [])

        def separate_frame(structure, roles, vnclass="separate-23.1"):
            return VerbnetOfficialFrame(structure, roles, vnclass, [])

        # Frames of the class itself, then of its two subclasses
        list1 = [
            separate_frame(
                ['NP', 'V', 'NP', {'from'}, 'NP'],
                ['Agent', 'Patient', 'Co-Patient']),
            separate_frame(['NP', 'V', 'NP'], ['Agent', 'Patient']),
            separate_frame(['NP', 'V'], ['Patient']),
            separate_frame(
                ['NP', 'V', {'from'}, 'NP'], ['Patient', 'Co-Patient']),
            separate_frame(['NP', 'V'], ['Patient']),
        ]
        list2 = [separate_frame(
            ['NP', 'V', {'from'}, 'NP'], ['Patient', 'Co-Patient'],
            "separate-23.1-1")]
        list3 = [separate_frame(
            ['NP', 'V', {'with'}, 'NP'], ['Patient', 'Co-Patient'],
            "separate-23.1-2")]

        expected_result = {
            'dissociate': list1+list3,
            'disconnect': list1+list3,
            'divide': list1+list2,
            'disassociate': list1,
            'disentangle': list1+list2,
            'divorce': list1+list2,
            'separate': list1+list3,
            'segregate': list1+list2,
            'part': list1+list3,
            'differentiate': list1+list2,
            'uncoil': list1,
            'decouple': list1+list2,
            'sever': list1,
            'dissimilate': list1+list2
        }

        # Print every mismatch before the final assertion, to ease debugging
        for verb, expected_frames in expected_result.items():
            actual_frames = reader.frames_for_verb[verb]
            if expected_frames != actual_frames:
                print("Error :")
                print(verb)
                for data in expected_frames:
                    print(data)
                print("\n")
                for data in actual_frames:
                    print(data)
                print("\n")

        self.assertEqual(reader.frames_for_verb, expected_result)

def init_verbnet(path):
    """Load VerbNet and return its two verb dictionaries.

    The constructs the reader could not handle are stored in
    errors["vn_parsing"].

    :param path: Path to VerbNet.
    :type path: str.
    :returns: (dict, dict) -- the reader's frames_for_verb and classes

    """
    print("Loading VerbNet data...")
    verbnet = VerbnetReader(path)
    errors["vn_parsing"] = verbnet.unhandled
    return verbnet.frames_for_verb, verbnet.classes