#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Build syntactic trees from CoNLL parser output"""
import unittest
[docs]class ConllInvalidPositionError(Exception):
"""Trying to build a subtree from a node that does not exist
:var bad_root: integer, the position from which we attempted to build a subtree
:var max_root: integer, the last valid position
"""
def __init__(self, bad_root, max_root):
self.bad_root = bad_root
self.max_root = max_root
def __str__(self):
return "Error : tried to build a subtree from position {} while"\
" parsing CoNLL output (last valid position was {})".format(
self.bad_root, self.max_root)
# http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
[docs]def LongestCommonSubstring(S1, S2):
M = [[0]*(1+len(S2)) for i in range(1+len(S1))]
longest, x_longest = 0, 0
for x in range(1,1+len(S1)):
for y in range(1,1+len(S2)):
if S1[x-1] == S2[y-1]:
M[x][y] = M[x-1][y-1] + 1
if M[x][y]>longest:
longest = M[x][y]
x_longest = x
else:
M[x][y] = 0
return S1[x_longest-longest: x_longest]
[docs]class SyntacticTreeNode:
"""A node (internal or terminal) of a syntactic tree
:var word_id: int, the CoNLL word id (starts at 1)
:var word: string, the word contained by the node
:var pos: part-of-speech of the node
:var deprel: string, function attributed by the parser to this word
:var father: SyntacticTreeBuilder, the father of this node
:var children: SyntacticTreeNode list, the children of this node
:var begin: int, the character position this phrase starts at (root would be 0)
:var end: int, the position this phrase ends at (root would be last character)
:var begin_word: int, the position this *word* begins at
"""
def __init__(self, word_id, word, pos, deprel, begin_word):
self.word_id = word_id
self.word = word
self.pos = pos
self.deprel = deprel
self.father = None
self.children = []
self.begin_word = begin_word
self.begin, self.end = None, None
def __repr__(self):
if self.children:
children = " " + " ".join([str(t) for t in self.children])
else:
children = ""
return "({}/{}/{}/{}/{} {}{})".format(self.pos, self.deprel, self.position, self.begin, self.end, self.word, children)
def __iter__(self):
for position, child in enumerate(self.children):
if position == self.position:
yield(self)
for node in child:
yield node
if self.position == len(self.children):
yield self
[docs] def flat(self):
"""Return the tokenized sentence from the parse tree."""
return " ".join([x.word for x in self])
[docs] def contains(self, arg):
"""Search an exact argument in all subtrees"""
return (self.flat() == arg or
any((c.contains(arg) for c in self.children)))
[docs] def closest_match(self, arg):
"""Search the closest match to arg"""
return self.closest_match_as_node(arg).flat().split()
[docs] def closest_match_as_node(self, arg):
return self._closest_match_as_node_lcs(arg)[1]
[docs] def _closest_match_as_node_lcs(self, arg):
root_match = self.flat().split()
root_match_len = (len(LongestCommonSubstring(root_match, arg.split())) /
(len(root_match) + len(arg.split())))
children_results = [c._closest_match_as_node_lcs(arg) for c in self.children]
return max([(root_match_len, self)] + children_results, key = lambda x: x[0])
[docs]class SyntacticTreeBuilder():
"""Wrapper class for the building of a syntactic tree
:var node_dict: every SyntacticTreeNode available by CoNLL word id
:var father_ids: every dependency relation: child id -> father id
:var tree_list: every root node, that is every root subtree
:var sentence: the "sentence" (words separated by spaces)
"""
def __init__(self, conll_tree):
"""Extract the data provided
:param conll_tree: The output of the CoNLL parser
:type conll_tree: str
"""
self.node_dict, self.father_ids = {}, {}
self.tree_list = []
begin = 0
for l in conll_tree.splitlines():
word_id, form, lemma, cpos, pos, feat, head, deprel, *junk = l.split("\t")
word_id = int(word_id)
head = int(head) if head != '-' else None
deprel = deprel if deprel != '-' else 'ROOT'
self.father_ids[word_id] = head
self.node_dict[word_id] = SyntacticTreeNode(
word_id=word_id,
word=form, pos=cpos,
deprel=deprel,
begin_word=begin)
begin += 1 + len(form)
self.sentence = ' '.join([self.node_dict[word_id].word for word_id in sorted(self.node_dict.keys())])
# Record father/child relationship
for word_id, father_id in self.father_ids.items():
if father_id is not None and father_id != 0:
self.node_dict[word_id].father = self.node_dict[father_id]
self.node_dict[father_id].children.append(self.node_dict[word_id])
# Record position: where is father among child? Important to flatten tree
for father in self.node_dict.values():
father.position = 0
for child_id, child in enumerate(father.children):
if child.begin_word > father.begin_word:
father.position = child_id
break
father.position = len(father.children)
for node in self.node_dict.values():
if node.father is None:
# Fill begin/end info
#import pudb; pu.db
self.fill_begin_end(node)
# Fill forest of tree
self.tree_list.append(node)
[docs] def fill_begin_end(self, node):
"""Fill begin/end values of very subtree"""
begin_words = [node.begin_word]
end_words = [node.begin_word + len(node.word) - 1]
for child in node.children:
self.fill_begin_end(child)
begin_words.append(child.begin)
end_words.append(child.end)
node.begin = min(begin_words)
node.end = max(end_words)
[docs]class ConllSemanticAppender():
"""Appends semantic information at the "right" of a ConLL file.
The input is a syntactic ConLL file, and the output a so-called semantic
CoNLL file.
"""
def __init__(self, syntactic_conll_file):
self.conll_matrix = []
with open(syntactic_conll_file) as content:
sentences_data = content.read().split("\n\n")
for sentence in sentences_data:
sentence_matrix = []
for line in sentence.split('\n'):
if len(line.split('\t')) == 1:
continue
# Put current line plus a line for potential frame annotations
sentence_matrix.append(line.split('\t') + ['-'])
self.conll_matrix.append(sentence_matrix)
[docs] def add_new_column(self, sentence_id):
for line in self.conll_matrix[sentence_id]:
line.append('-')
[docs] def add_frame_annotation(self, frame_annotation):
# We could have multiple classes, so join them with |
self.conll_matrix[frame_annotation.sentence_id][frame_annotation.predicate_position-1][10] = '|'.join(frame_annotation.best_classes)
# Add new column to place the new roles
self.add_new_column(frame_annotation.sentence_id)
for roleset, arg in zip(frame_annotation.roles, frame_annotation.args):
roleset_str = '|'.join(roleset) if roleset else '_EMPTYROLE_'
self.conll_matrix[frame_annotation.sentence_id][arg.position-1][-1] = roleset_str
[docs] def dump_semantic_file(self, filename):
with open(filename, 'w') as semantic_file:
for i, sentence in enumerate(self.conll_matrix):
for line in sentence:
print('\t'.join(line), file=semantic_file)
if i < len(self.conll_matrix) - 1:
print(end='\n', file=semantic_file)
[docs]class TreeBuilderTest(unittest.TestCase):
[docs] def setUp(self):
conll_tree = \
"""1 The The DT DT - 2 NMOD - -
2 others others NNS NNS - 5 SBJ - -
3 here here RB RB - 2 LOC - -
4 today today RB RB - 3 TMP - -
5 live live VV VV - 0 ROOT - -
6 elsewhere elsewhere RB RB - 5 LOC - -
7 . . . . - 5 P - -"""
treeBuilder = SyntacticTreeBuilder(conll_tree)
self.tree_list = treeBuilder.tree_list
[docs] def test_none_fathers(self):
for tree in self.tree_list:
self.assertEqual(tree.father, None)
[docs] def test_tree_str(self):
#The others here today live elsewhere .
expected_str = "(VV/ROOT/1/0/37 live (NNS/SBJ/1/0/20 others (DT/NMOD/0/0/2 The) (RB/LOC/0/11/20 here (RB/TMP/0/16/20 today))) (RB/LOC/0/27/35 elsewhere) (./P/0/37/37 .))"
self.assertEqual(str(self.tree_list[0]), expected_str)
[docs] def test_tree_flat(self):
self.assertEqual(self.tree_list[0].flat(), "The others here today live elsewhere .")
[docs] def test_tree_contains(self):
self.assertTrue(self.tree_list[0].contains("here today"))
self.assertFalse(self.tree_list[0].contains("others here today"))
[docs] def test_tree_match(self):
self.assertEqual(self.tree_list[0].closest_match("others here today"),
['The', 'others', 'here', 'today'])
[docs] def test_lima_tree(self):
conll_tree = \
"""1 Jamaica Jamaica NNP NNP - 2 SUB - -
2 is be VBZ VBZ - - - - -
3 not not NOT NOT - 2 VMOD - -
4 just just RB RB - - - - -
5 a a DT DT - 6 NMOD - -
6 destination destination NN NN - 2 OBJ - -
7 it it PRP PRP - 8 SUB - -
8 is be VBZ VBZ - 6 NMOD - -
9 an a DT DT - 10 NMOD - -
10 experience experience NN NN - 8 OBJ - -"""
treeBuilder = SyntacticTreeBuilder(conll_tree)
for tree in treeBuilder.tree_list:
self.assertEqual(tree.father, None)
print(str(tree))
[docs] def test_another_flat(self):
conll_tree = \
"""1 a a DT DT - 3 NMOD - -
2 few few JJ JJ - 3 NMOD - -
3 months months NNS NNS - 4 AMOD - -
4 ago ago IN IN - 6 VMOD - -
5 you you PRP PRP - 6 SUB - -
6 received received VBD VBD - 0 ROOT - -
7 a a DT DT - 8 NMOD - -
8 letter letter NN NN - 6 OBJ - -"""
self.assertEqual(SyntacticTreeBuilder(conll_tree).tree_list[0].flat(), 'a few months ago you received a letter')