Commit 2e639f30 authored by Joel Oksanen

Started implementing BERT-based target extraction

parent ce393701
......@@ -3,5 +3,7 @@
__pycache__/
server/agent/amazon_data/
server/agent/target_extraction/data/
server/agent/target_extraction/stanford-corenlp-full-2018-10-05
.DS_Store
*.pickle
\ No newline at end of file
*.pickle
*.wv
\ No newline at end of file
......@@ -61,12 +61,14 @@ class Agent:
elif argument_sums[subf] < 0:
attackers[r].append(subf)
# calculate base scores for arguments
base_scores = {self.product_node: 0.5 + 0.5 * argument_sums[self.product_node] / review_count}
# calculate base scores for arguments: CHANGES TO INTERIM REPORT METHOD
base_strengths = {self.product_node: 0.5 + 0.5 * argument_sums[self.product_node] / review_count}
for feature in self.features:
base_scores[feature] = abs(argument_sums[feature]) / review_count
base_strengths[feature] = abs(argument_sums[feature]) / review_count
base_scores = {arg: 0.5 + 0.5 * argument_sums[arg] / review_count for arg in self.arguments}
qbaf = {'supporters': supporters, 'attackers': attackers, 'base_scores': base_scores}
qbaf = {'supporters': supporters, 'attackers': attackers,
'base_strengths': base_strengths, 'base_scores': base_scores}
return qbaf
@staticmethod
......@@ -86,8 +88,10 @@ class Agent:
return base_score
# apply DF-QUAD gradual semantics to qbaf
# CHANGES TO INTERIM REPORT METHOD
def get_strengths(self, qbaf):
strengths = {}
scores = {}
arguments = [node for node in PostOrderIter(self.product_node)]
for argument in arguments:
attacker_strengths = []
......@@ -97,9 +101,11 @@ class Agent:
attacker_strengths.append(strengths[child])
elif child in qbaf['supporters'][argument]:
supporter_strengths.append(strengths[child])
strengths[argument] = Agent.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
strengths[argument] = Agent.argument_strength(qbaf['base_strengths'][argument], attacker_strengths,
supporter_strengths)
return strengths
scores[argument] = Agent.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
supporter_strengths)
return strengths, scores
def analyze_reviews(self, csv):
reviews = [Review(row, self.product) for _, row in csv.iterrows()]
......@@ -113,17 +119,18 @@ class Agent:
# get qbaf from ra
self.qbaf = self.get_qbaf(ra, len(reviews))
# apply gradual semantics
self.strengths = self.get_strengths(self.qbaf)
self.strengths, self.scores = self.get_strengths(self.qbaf)
# print results
print('qbaf:')
print(self.qbaf)
print('strengths:')
print(self.strengths)
print('votes:')
for argument in self.arguments:
print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
print(argument.name)
print(' strengths:', self.strengths[argument])
print(' scores:', self.scores[argument])
print(' votes:')
print(' direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
len(self.attacking_phrases(argument))))
print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))
print(' augmented sum: {}'.format(self.vote_sum[argument]))
def get_strongest_supporting_subfeature(self, argument):
supporters = self.qbaf['supporters'][argument]
......@@ -140,8 +147,8 @@ class Agent:
return max(attacker_strengths, key=attacker_strengths.get)
def liked_argument(self, argument):
return self.vote_sum[argument] >= 0
# self.strengths[argument] > 0.5
return self.scores[argument] > 0.5
# self.vote_sum[argument] >= 0
# len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
def supported_argument(self, argument):
......
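The hunks above split the old base_scores into base_strengths (fed into the gradual semantics) and vote-derived base_scores, and get_strengths now returns a (strengths, scores) pair computed with the same combination function. The body of Agent.argument_strength lies mostly outside this diff; for reference, a standard DF-QUAD combination (which the "apply DF-QUAD gradual semantics" comment refers to) looks roughly like the sketch below. The helper name _aggregate is illustrative and not taken from the commit.

def _aggregate(strengths):
    # DF-QUAD aggregation: probabilistic sum of attacker or supporter strengths
    v = 0.0
    for s in strengths:
        v = v + s - v * s
    return v

def argument_strength(base, attacker_strengths, supporter_strengths):
    # DF-QUAD combination: pull the base value towards 0 or 1 depending on
    # whether attackers or supporters dominate
    va = _aggregate(attacker_strengths)
    vs = _aggregate(supporter_strengths)
    if va >= vs:
        return base - base * (va - vs)
    return base + (1 - base) * (vs - va)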
from agent.argumentquery import ArgumentQuery
from agent.agent import Agent
from agent.target_extraction.product import Product
import inflect
from nltk.stem import WordNetLemmatizer
from threading import Thread
......@@ -24,9 +22,6 @@ class Communicator:
ArgumentQuery(5, 'What did users say about the {arg} being poor?'),
]
inflect = inflect.engine()
wnl = WordNetLemmatizer()
def __init__(self, dl):
self.dl = dl
self.product = None
......@@ -65,10 +60,10 @@ class Communicator:
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was highly rated because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_name))
self.was_were(supp_node))
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_name))
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
......@@ -79,10 +74,10 @@ class Communicator:
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was considered to be good because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_name))
self.was_were(supp_node))
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_name))
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
......@@ -93,10 +88,10 @@ class Communicator:
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
att_name = self.product.argument_for_node(att_node).name
text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
self.was_were(att_name))
self.was_were(att_node))
if supp_node:
supp_name = self.product.argument_for_node(supp_node).name
text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_name))
text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_node))
args = [q_arg_node, att_node, supp_node]
else:
text += '.'
......@@ -134,5 +129,5 @@ class Communicator:
return queries
def was_were(self, term):
return 'was' if self.wnl.lemmatize(term) == term else 'were'
def was_were(self, arg_n):
return 'was' if self.product.singularities[arg_n] else 'were'
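The new was_were no longer lemmatizes on every call; it looks up a precomputed flag in Product.singularities (see the product.py hunk further down). For the 'was'/'were' choice to work, that flag has to be a boolean marking whether the argument name is already singular, which is how the old implementation behaved. A small illustration of that check (the example nouns are arbitrary):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# singular nouns lemmatize to themselves, plurals do not
wnl.lemmatize('battery') == 'battery'        # True  -> 'was'
wnl.lemmatize('batteries') == 'batteries'    # False -> 'were'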
annotators = tokenize,ssplit,pos,lemma,ner,parse,depparse,coref
outputFormat = serialized
serializer = edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer
......@@ -2,7 +2,7 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/amazon_reviews_us_pc.tsv'
data_location = 'agent/amazon_data/reviews_for_backpack.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
......
from rel_dataset import RelInstance
from relbertnet import RelBertNet
class BertExtractor:
pass
i = RelInstance('Testing if this works.')
net = RelBertNet()
net(i.to_tensor())
\ No newline at end of file
from transformers.file_utils import add_start_docstrings_to_callable
from transformers.modeling_bert import BertPreTrainedModel, BertEncoder, BertEmbeddings, BertPooler
from transformers import BertModel
import torch
BERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""
class ModularBertModel(BertModel):
def __init__(self, config, k):
super(BertModel, self).__init__(config)
self.config = config
self.embeddings = BertEmbeddings(config)
self.encoder = ModularBertEncoder(config)
self.pooler = BertPooler(config)
self.init_weights()
self.n = config.num_hidden_layers
self.k = k
print(k)
print(ModularBertModel.__mro__)
print(self.pretrained_model_archive_map)
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
input_hidden_states=None
):
if input_ids is not None and input_hidden_states is not None:
raise ValueError("You cannot specify both input_ids and input_hidden_states at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
elif input_hidden_states is not None:
# not sure if this is correct
input_shape = input_hidden_states.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or input_hidden_states")
device = input_ids.device if input_ids is not None else input_hidden_states.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
attention_mask, input_shape, self.device
)
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if input_ids is not None:
embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=None
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
layers=(0, self.n - self.k)
)
else:
encoder_outputs = self.encoder(
input_hidden_states,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
layers=(self.n - self.k, self.n)
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
class ModularBertEncoder(BertEncoder):
def __init__(self, config):
super().__init__(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
layers=None
):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer[layers[0]:layers[1]] if layers else self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
)
hidden_states = layer_outputs[0]
if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
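ModularBertModel splits the n encoder layers at n - k: a call with input_ids runs the embeddings plus layers 0 .. n-k, while a call with input_hidden_states runs only the last k layers on precomputed hidden states. A minimal sketch of that two-stage call pattern follows; it is hypothetical driver code rather than part of the commit, it assumes the slice-based layers handling in ModularBertEncoder above, and the weights here are randomly initialised rather than loaded from a pretrained checkpoint.

from transformers import BertConfig, BertTokenizer

config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = ModularBertModel(config, k=4)

encoded = tokenizer.encode_plus('Testing if this works.', return_tensors='pt')

# stage 1: embeddings + first n - k encoder layers
seq_out, pooled, *rest = model(input_ids=encoded['input_ids'],
                               attention_mask=encoded['attention_mask'])

# stage 2: remaining k encoder layers applied to the intermediate hidden states
seq_out2, pooled2, *rest2 = model(input_hidden_states=seq_out,
                                  attention_mask=encoded['attention_mask'])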
from transformers import BertTokenizer
from relbertnet import TRAINED_WEIGHTS
MAX_SEQ_LEN = 128
tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
class RelInstance:
def __init__(self, text):
self.text = text
def get(self):
tokens = tokenizer.tokenize(self.text)
return tokens
def to_tensor(self):
tokens = self.get()
encoded = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_SEQ_LEN,
is_pretokenized=True, return_tensors='pt')
return encoded
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertConfig, BertModel
K = 4 # number of hidden layers in Bert2
HIDDEN_OUTPUT_FEATURES = 768
TRAINED_WEIGHTS = 'bert-base-uncased'
class RelBertNet(nn.Module):
def __init__(self):
super(RelBertNet, self).__init__()
# Load pretrained BERT weights
config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
self.bert1 = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
# Divide BERT encoder layers into two parts
self.bert2_layers = self.bert1.encoder.layer[-K:]
self.bert1.encoder.layer = self.bert1.encoder.layer[:-K]
self.n = config.num_hidden_layers
def forward(self, encoded_text):
# BERT1 with MASKall for context
bert_context_output, _ = self.bert1(**encoded_text)
# BERT2 with MASKall for NER
bert_ner_output = bert_context_output
for layer in self.bert2_layers:
bert_ner_output, = layer(bert_ner_output, attention_mask=None)
# without CLS token
bert_ner_output = bert_ner_output.narrow(1, 1, bert_ner_output.size()[1]-1)
print(bert_ner_output.size())
# CRF for NER
ner_output = None
# For each pair of named entities recognized, perform BERT2 with MASKrc for RC
# c = torch.combinations(ner_output)
# pairs = torch.cat((c, torch.flip(c, dims=(1,))), dim=0)
# bert_rc_output = bert_context_output
# for layer in self.bert2_layers:
# bert_rc_output, = layer(bert_rc_output, attention_mask=None)
# MLP for RC
rc_output = None
# Return NER and RC outputs
return ner_output, rc_output
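The NER and RC heads are still placeholders at this point (ner_output and rc_output are None), and the commented-out lines sketch how entity pairs are meant to be enumerated for relation classification. A small illustration of that pairing pattern, using made-up entity positions:

import torch

# hypothetical token positions of three recognised entities
entity_idx = torch.tensor([2, 5, 9])

c = torch.combinations(entity_idx, r=2)                  # unordered pairs: (2,5), (2,9), (5,9)
pairs = torch.cat((c, torch.flip(c, dims=(1,))), dim=0)  # add reversed pairs for directed RC
# pairs now holds 6 ordered (head, tail) candidates to run through BERT2 with MASKrc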
......@@ -2,6 +2,9 @@ from anytree import Node
import pickle
from os.path import isfile
from agent.target_extraction.argument import Argument
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
class Product:
......@@ -16,6 +19,7 @@ class Product:
for a, syns in syn_dict.items() for a_node in self.argument_nodes if a_node.name == a}
self.arguments = {a_node: Argument(a_idx, a_node.name.replace('_', ' '))
for a_idx, a_node in enumerate(self.argument_nodes)}
self.singularities = {a_node: wnl.lemmatize(a_node.name) == a_node.name for a_node in self.argument_nodes}
def argument_node_for_id(self, id):
return self.argument_nodes[id]
......