Joel Oksanen / individual_project

Commit 174b15e4
Authored Apr 15, 2020 by Joel Oksanen

Merge branch 'master' of https://gitlab.doc.ic.ac.uk/jjo2317/individual_project

Parents: 6986b0d8, c7ab3edc
Changes: 3 files
ADA/SA/bert_analyzer.py
@@ -3,13 +3,15 @@ import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader
 from tdbertnet import TDBertNet
-from bert_dataset import BertDataset, polarity_indices, generate_batch
+from bert_dataset import BertDataset, Instance, polarity_indices, generate_batch
 import time
 import numpy as np
 from sklearn import metrics
+import matplotlib.pyplot as plt

 semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
 semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'data/Amazon/amazon_camera_test.xml'
 trained_model_path = 'semeval_2014.pt'
 BATCH_SIZE = 32
@@ -45,7 +47,7 @@ class BertAnalyzer:
             optimiser.zero_grad()
             # forward pass
-            outputs = self.net(texts, target_indices)
+            outputs, _ = self.net(texts, target_indices)
             # backward pass
             l = loss(outputs, labels)
@@ -74,7 +76,7 @@ class BertAnalyzer:
         truths = []
         with torch.no_grad():
             for (texts, target_indices, labels) in test_loader:
-                outputs = self.net(texts, target_indices)
+                outputs, attentions = self.net(texts, target_indices)
                 _, pred = torch.max(outputs.data, 1)
                 predicted += pred.tolist()
                 truths += labels.tolist()
@@ -90,7 +92,33 @@ class BertAnalyzer:
         f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
         print('macro F1:', f1)

+    def analyze_sentence(self, text, char_from, char_to):
+        instance = Instance(text, char_from, char_to)
+        tokens, tg_from, tg_to = instance.get()
+        texts, target_indices = instance.to_tensor()
+
+        with torch.no_grad():
+            outputs, attentions = self.net(texts, target_indices)
+
+        target_attentions = torch.mean(attentions, 1)[0][tg_from + 1:tg_to + 2]
+        mean_target_att = torch.mean(target_attentions, 0)
+
+        # plot attention histogram
+        att_values = mean_target_att.numpy()[1:-1]
+        ax = plt.subplot(111)
+        width = 0.3
+        bins = [x - width / 2 for x in range(1, len(att_values) + 1)]
+        ax.bar(bins, att_values, width=width)
+        ax.set_xticks(list(range(1, len(att_values) + 1)))
+        ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right')
+        plt.show()
+
+        _, pred = torch.max(outputs.data, 1)
+        return pred
+

 sentiment_analyzer = BertAnalyzer()
 sentiment_analyzer.load_saved()
-sentiment_analyzer.evaluate(semeval_2014_test_path)
\ No newline at end of file
+sentiment = sentiment_analyzer.analyze_sentence('I will never buy another computer from HP/Compaq or do business with Circuit City again.', 39, 48)
+print('sentiment:', sentiment)
\ No newline at end of file
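
The new analyze_sentence slices the averaged attention map with tg_from + 1:tg_to + 2 because encode_plus(..., add_special_tokens=True) prepends a [CLS] token, shifting every sentence token one position right. A minimal sketch of that slice with toy tensors (all sizes and values below are invented for illustration, not part of the commit):

import torch

n_heads, seq_len = 12, 10                               # invented sizes
attentions = torch.rand(1, n_heads, seq_len, seq_len)   # stand-in for the last BERT layer
tg_from, tg_to = 3, 5                                   # target token span before the [CLS] shift

# average over heads, then keep the attention rows of the target tokens;
# the +1/+2 shift skips the [CLS] token at position 0
target_attentions = torch.mean(attentions, 1)[0][tg_from + 1:tg_to + 2]
mean_target_att = torch.mean(target_attentions, 0)      # one weight per sequence position
print(mean_target_att.shape)                            # torch.Size([10])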
ADA/SA/bert_dataset.py
@@ -26,13 +26,11 @@ def generate_batch(batch):
 def token_for_char(char_idx, text, tokens):
-    compressed_idx = len(re.sub(r'\s+', '', text)[:char_idx + 1]) - 1
+    compressed_idx = len(re.sub(r'\s+', '', text[:char_idx + 1])) - 1
     token_idx = -1
     while compressed_idx >= 0:
         token_idx += 1
         compressed_idx -= len(tokens[token_idx].replace('##', ''))
     return token_idx
@@ -55,17 +53,36 @@ class BertDataset(Dataset):
                 char_from = int(term.attrib['from'])
                 char_to = int(term.attrib['to']) - 1
                 polarity = term.attrib['polarity']
-                self.data.append((text, char_from, char_to, polarity))
+                self.data.append((Instance(text, char_from, char_to), polarity))

     def __len__(self):
         return len(self.data)

     def __getitem__(self, idx):
-        text, char_from, char_to, polarity_str = self.data[idx]
-        tokens = tokenizer.tokenize(text)
-        idx_from = token_for_char(char_from, text, tokens)
-        idx_to = token_for_char(char_to, text, tokens)
+        instance, polarity_str = self.data[idx]
+        tokens, idx_from, idx_to = instance.get()
         polarity = polarity_index(polarity_str)
         return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
+
+
+class Instance:
+
+    def __init__(self, text, char_from, char_to):
+        self.text = text
+        self.char_from = char_from
+        self.char_to = char_to
+
+    def get(self):
+        tokens = tokenizer.tokenize(self.text)
+        idx_from = token_for_char(self.char_from, self.text, tokens)
+        idx_to = token_for_char(self.char_to - 1, self.text, tokens)
+        return tokens, idx_from, idx_to
+
+    def to_tensor(self):
+        tokens, idx_from, idx_to = self.get()
+        text = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_SEQ_LEN, is_pretokenized=True, return_tensors='pt')
+        target_indices = torch.tensor([[[t] * HIDDEN_OUTPUT_FEATURES for t in range(idx_from, idx_to + 1)]])
+        return text, target_indices
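
A hedged usage sketch of the new Instance class (the sentence and character span are invented; it assumes the module-level tokenizer, MAX_SEQ_LEN and HIDDEN_OUTPUT_FEATURES that bert_dataset.py already references are in scope):

from bert_dataset import Instance

instance = Instance('The battery life is great.', 4, 16)   # chars 4..15 cover 'battery life'
tokens, idx_from, idx_to = instance.get()        # WordPiece tokens plus target token span
texts, target_indices = instance.to_tensor()     # encoded input and gather indices
# target_indices has shape (1, n_target_tokens, HIDDEN_OUTPUT_FEATURES): each target
# token index is repeated across the hidden dimension so TDBertNet.forward can pass
# it straight to torch.gather
print(target_indices.shape)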
ADA/SA/tdbertnet.py
@@ -6,19 +6,23 @@ from transformers import *
 HIDDEN_OUTPUT_FEATURES = 768
 TRAINED_WEIGHTS = 'bert-base-uncased'

 class TDBertNet(nn.Module):

     def __init__(self, num_class):
         super(TDBertNet, self).__init__()
-        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS)
+        config = BertConfig.from_pretrained(TRAINED_WEIGHTS, output_attentions=True)
+        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
+        self.bert_base.config.output_attentions = True
         self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels

     def forward(self, texts, target_indices):
         # BERT
-        bert_output = self.bert_base(**texts)[0]
+        bert_output, _, attentions = self.bert_base(**texts)
         # max pooling at target locations
         target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
         pooled_output = torch.max(target_outputs, dim=1)[0]
         # fc layer
         x = self.fc(pooled_output)
-        return x
+        return x, attentions[-1]
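
The forward change relies on the tuple outputs of the transformers library in use at the time: with output_attentions=True, BertModel returns (last_hidden_state, pooler_output, attentions), where attentions holds one (batch, heads, seq, seq) tensor per layer. A small sketch to check that unpacking (example sentence invented; shapes in comments are for bert-base-uncased):

import torch
from transformers import BertConfig, BertModel, BertTokenizer

config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True)
bert = BertModel.from_pretrained('bert-base-uncased', config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

texts = tokenizer.encode_plus('battery life is great', return_tensors='pt')
with torch.no_grad():
    last_hidden, pooled, attentions = bert(**texts)
print(last_hidden.shape)     # torch.Size([1, seq_len, 768])
print(len(attentions))       # 12, one entry per layer for bert-base
print(attentions[-1].shape)  # torch.Size([1, 12, seq_len, seq_len]) -- what forward() returns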