Joel Oksanen / individual_project / Commits

Commit b84c4eb6 authored Jul 21, 2020 by Joel Oksanen
Experimentation with further improving feature and relation extraction
Parent: 71a5c117
Showing 10 changed files with 934 additions and 215 deletions
ADA-X/.gitignore (+1, -0)
ADA-X/server/agent/target_extraction/BERT/entity_extractor/bert_entity_extractor.py (+8, -6)
ADA-X/server/agent/target_extraction/BERT/entity_extractor/entity_dataset.py (+51, -107)
ADA-X/server/agent/target_extraction/BERT/entity_extractor/entitybertnet.py (+2, -6)
ADA-X/server/agent/target_extraction/BERT/relation_extractor/bert_rel_extractor.py (+61, -30)
ADA-X/server/agent/target_extraction/BERT/relation_extractor/pairbertnet.py (+2, -13)
ADA-X/server/agent/target_extraction/BERT/relation_extractor/rel_dataset.py (+178, -0)
ADA-X/server/agent/target_extraction/BERT/relation_extractor/relbertnet.py (+53, -0)
ADA-X/server/agent/target_extraction/entity_annotation.py (+267, -13)
ADA-X/server/agent/target_extraction/target_extractor.py (+311, -40)
ADA-X/.gitignore (+1, -0)

@@ -5,6 +5,7 @@ server/agent/amazon_data/
 server/agent/SA/data/
 server/agent/target_extraction/data/
 server/agent/target_extraction/BERT/data/
+server/agent/target_extraction/eval/qa/
 .DS_Store
 *.pickle
 *.wv
\ No newline at end of file
ADA-X/server/agent/target_extraction/BERT/entity_extractor/bert_entity_extractor.py (+8, -6)

@@ -10,7 +10,7 @@ from sklearn import metrics
 import statistics
 from transformers import get_linear_schedule_with_warmup
 from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset, generate_batch, generate_production_batch
-from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet
+from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet, BATCH_SIZE

 device = torch.device('cuda')
@@ -21,7 +21,6 @@ MAX_GRAD_NORM = 1.0
 # training
 N_EPOCHS = 3
-BATCH_SIZE = 32
 WARM_UP_FRAC = 0.05

 # loss
@@ -61,8 +60,7 @@ class BertEntityExtractor:
         else:
             train_size = int(size * (1 - valid_frac)) if size is not None else None
             train_data, _ = EntityDataset.from_file(file_path, size=train_size)
-            valid_size = int(size * valid_frac) if size is not None else int(len(train_data) * valid_frac)
-            valid_data, _ = EntityDataset.from_file(valid_file_path, size=valid_size)
+            valid_data, _ = EntityDataset.from_file(valid_file_path)

         train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                   collate_fn=generate_batch)
@@ -119,11 +117,11 @@ class BertEntityExtractor:
             print('epoch done')
+            torch.save(self.net.state_dict(), '{}_epoch_{}.pt'.format(save_file, epoch_idx + 1))
             if valid_data is not None:
                 self.evaluate(data=valid_data)
-        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
         end = time.time()
         print('Training took', end - start, 'seconds')
@@ -207,3 +205,7 @@ class BertEntityExtractor:
             probs[ins.entity].append(score)
         return {t: statistics.mean(t_probs) if len(t_probs) > 0 else None for t, t_probs in probs.items()}
+
+
+BertEntityExtractor.train_and_validate('all_reviews_features.tsv', 'feature_extractor',
+                                       valid_file_path='annotated_watch_review_features.tsv')
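The checkpointing change in this file swaps the single post-training save for one checkpoint per epoch, so each epoch's weights survive for comparison against the per-epoch validation results printed by evaluate. A minimal sketch of the resulting save/load round trip, using a stand-in linear head rather than the real EntityBertNet but the same '{}_epoch_{}.pt' naming scheme:

import torch
import torch.nn as nn

net = nn.Linear(768, 2)  # stand-in for EntityBertNet's classification head

save_file = 'feature_extractor'
for epoch_idx in range(3):
    # ... one epoch of training would happen here ...
    # mirrors the added per-epoch save in the diff above
    torch.save(net.state_dict(), '{}_epoch_{}.pt'.format(save_file, epoch_idx + 1))

# later: restore whichever epoch validated best (epoch 2, say)
net.load_state_dict(torch.load('feature_extractor_epoch_2.pt'))
net.eval()

Keeping every epoch's weights trades disk space for the ability to roll back to the best-validating epoch, which fits the experimental tone of the commit message.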
ADA-X/server/agent/target_extraction/BERT/entity_extractor/entity_dataset.py (+51, -107)

@@ -8,58 +8,22 @@ import os.path
 from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES

 MAX_SEQ_LEN = 128
-LABELS = ['ASPECT', 'NAN']
-LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
 MASK_TOKEN = '[MASK]'
 tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)


-def generate_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    labels = torch.tensor([instance.label for instance in batch])
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, labels
-
-
-def generate_production_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, batch
-
-
-def indices_for_entity_ranges(ranges):
-    max_e_len = max(end - start for start, end in ranges)
-    indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
-                             for t in range(start, start + max_e_len + 1)]
-                            for start, end in ranges])
-    return indices
-
-
 class EntityDataset(Dataset):

-    def __init__(self, df, size=None):
-        # filter inapplicable rows
-        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
+    def __init__(self, df, training=True, size=None):
+        self.df = df
+        self.training = training

         # sample data if a size is specified
         if size is not None and size < len(self):
             self.df = self.df.sample(size, replace=False)

     @staticmethod
-    def from_df(df, size=None):
-        dataset = EntityDataset(df, size=size)
+    def for_extraction(df):
+        dataset = EntityDataset(df, training=False)
         print('Obtained dataset of size', len(dataset))
         return dataset
@@ -83,80 +47,60 @@ class EntityDataset(Dataset):
         print('Obtained train set of size', len(dataset), 'and validation set of size', len(validset))
         return dataset, validset

-    @staticmethod
-    def instance_from_row(row):
-        unpacked_arr = literal_eval(row['entityMentions']) if type(row['entityMentions']) is str else row['entityMentions']
-        rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in LABELS]
-        if len(rms) == 1:
-            entity, label = rms[0]['text'], (rms[0]['label'] if 'label' in rms[0] else None)
-        else:
-            return None  # raise AttributeError('Instances must have exactly one relation')
-        text = row['sentText']
-        return EntityDataset.get_instance(text, entity, label=label)
-
-    @staticmethod
-    def get_instance(text, entity, label=None):
-        tokens = tokenizer.tokenize(text)
-        i = 0
-        found_entity = False
-        entity_range = None
-        while i < len(tokens):
-            match_length = EntityDataset.token_entity_match(i, entity.lower(), tokens)
-            if match_length is not None:
-                if found_entity:
-                    return None  # raise AttributeError('Entity {} appears twice in text {}'.format(entity, text))
-                found_entity = True
-                tokens[i:i + match_length] = [MASK_TOKEN] * match_length
-                entity_range = (i + 1, i + match_length)  # + 1 taking into account the [CLS] token
-                i += match_length
-            else:
-                i += 1
-        if found_entity:
-            return PairRelInstance(tokens, entity, entity_range, LABEL_MAP[label], text)
-        else:
-            return None
-
-    @staticmethod
-    def token_entity_match(first_token_idx, entity, tokens):
-        token_idx = first_token_idx
-        remaining_entity = entity
-        while remaining_entity:
-            if remaining_entity == entity or remaining_entity.lstrip() != remaining_entity:
-                # start of new word
-                remaining_entity = remaining_entity.lstrip()
-                if token_idx < len(tokens) and tokens[token_idx] == remaining_entity[:len(tokens[token_idx])]:
-                    remaining_entity = remaining_entity[len(tokens[token_idx]):]
-                    token_idx += 1
-                else:
-                    break
-            else:
-                # continuing same word
-                if (token_idx < len(tokens) and tokens[token_idx].startswith('##')
-                        and tokens[token_idx][2:] == remaining_entity[:len(tokens[token_idx][2:])]):
-                    remaining_entity = remaining_entity[len(tokens[token_idx][2:]):]
-                    token_idx += 1
-                else:
-                    break
-        if remaining_entity:
-            return None
-        else:
-            return token_idx - first_token_idx
+    def instance_from_row(self, row):
+        if self.training:
+            return EntityInstance(literal_eval(row['tokens']),
+                                  row['entity_idx'],
+                                  label=row['label'])
+        else:
+            return EntityInstance(row['tokens'], row['entity_idx'],
+                                  entity=row['entity'])

     def __len__(self):
         return len(self.df.index)

     def __getitem__(self, idx):
-        return EntityDataset.instance_from_row(self.df.iloc[idx])
+        return self.instance_from_row(self.df.iloc[idx])


-class PairRelInstance:
+class EntityInstance:

-    def __init__(self, tokens, entity, entity_range, label, text):
+    def __init__(self, tokens, entity_idx, label=None, entity=None):
         self.tokens = tokens
-        self.entity = entity
-        self.entity_range = entity_range
+        self.entity_idx = entity_idx
         self.label = label
-        self.text = text
+        self.entity = entity
+
+
+def generate_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    labels = torch.tensor([instance.label for instance in instances])
+    return input_ids, attn_mask, entity_indices, labels
+
+
+def generate_production_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    return input_ids, attn_mask, entity_indices, instances
+
+
+# def indices_for_entity_ranges(ranges):
+#     max_e_len = max(end - start for start, end in ranges)
+#     indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
+#                              for t in range(start, start + max_e_len + 1)]
+#                             for start, end in ranges])
#     return indices
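The dataset now expects rows that are already tokenized and annotated with a single entity_idx, so collation reduces to stacking integers instead of building the three-dimensional index tensors that indices_for_entity_ranges produced. A minimal sketch of that collation path with hand-written instances (the tokenizer call from generate_batch is omitted; the token lists and index values are illustrative, not taken from the repository's data):

import torch

class EntityInstance:  # mirrors the new class: one index instead of a (start, end) range
    def __init__(self, tokens, entity_idx, label=None, entity=None):
        self.tokens = tokens
        self.entity_idx = entity_idx
        self.label = label
        self.entity = entity

# two illustrative training instances; entity_idx points at the masked entity
# token, offset by +1 for the [CLS] token that batch_encode_plus prepends
batch = [
    EntityInstance(['the', '[MASK]', 'is', 'great'], 2, label=1),
    EntityInstance(['[MASK]', 'broke', 'quickly'], 1, label=1),
]

# the tensor parts of the new generate_batch
entity_indices = torch.tensor([ins.entity_idx for ins in batch])  # tensor([2, 1])
labels = torch.tensor([ins.label for ins in batch])               # tensor([1, 1])
print(entity_indices, labels)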
ADA-X/server/agent/target_extraction/BERT/entity_extractor/entitybertnet.py (+2, -6)

@@ -5,6 +5,7 @@ from transformers import *
 HIDDEN_OUTPUT_FEATURES = 768
 TRAINED_WEIGHTS = 'bert-base-uncased'
 NUM_CLASSES = 2  # entity, not entity
+BATCH_SIZE = 32


 class EntityBertNet(nn.Module):
@@ -20,14 +21,9 @@ class EntityBertNet(nn.Module):
         bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)

         # max pooling at entity locations
-        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)
+        entity_pooled_output = bert_output[torch.arange(0, bert_output.shape[0]), entity_indices]

         # fc layer (softmax activation done in loss function)
         x = self.fc(entity_pooled_output)
         return x
-
-    @staticmethod
-    def pooled_output(bert_output, indices):
-        outputs = torch.gather(bert_output, dim=1, index=indices)
-        pooled_output, _ = torch.max(outputs, dim=1)
-        return pooled_output
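The forward-pass change replaces span gathering plus max pooling with plain advanced indexing: one (batch, position) pair selects a single hidden vector per sequence. A small self-contained comparison on a random tensor, with toy dimensions standing in for the real 768-feature BERT output, shows the two agree exactly when every entity span has length 1:

import torch

batch, seq_len, hidden = 2, 6, 4          # toy sizes; HIDDEN_OUTPUT_FEATURES is 768 in the real net
bert_output = torch.randn(batch, seq_len, hidden)
entity_indices = torch.tensor([2, 4])     # one entity position per sequence

# new: one hidden vector per sequence, picked by (batch, position) advanced indexing
new_pooled = bert_output[torch.arange(0, bert_output.shape[0]), entity_indices]
assert new_pooled.shape == (batch, hidden)
assert torch.equal(new_pooled[0], bert_output[0, 2])

# old: gather a (possibly multi-token) span and max-pool over it, as the removed
# pooled_output staticmethod did with indices built by indices_for_entity_ranges
span_indices = entity_indices.view(-1, 1, 1).expand(-1, 1, hidden)  # here: spans of length 1
old_pooled, _ = torch.max(torch.gather(bert_output, dim=1, index=span_indices), dim=1)
assert torch.allclose(old_pooled, new_pooled)  # identical when every span has length 1

This only makes sense together with the entity_dataset.py change above, where each instance now records one entity_idx rather than an entity_range.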
ADA-X/server/agent/target_extraction/BERT/relation_extractor/bert_rel_extractor.py (+61, -30)

@@ -8,8 +8,10 @@ import time
 import numpy as np
 from sklearn import metrics
 from transformers import get_linear_schedule_with_warmup
-from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
-from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet
+# from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
+from agent.target_extraction.BERT.relation_extractor.rel_dataset import PairRelDataset, generate_batch, generate_production_batch, RelInstance
+# from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet
+from agent.target_extraction.BERT.relation_extractor.relbertnet import NUM_CLASSES, RelBertNet

 device = torch.device('cuda')
@@ -30,12 +32,12 @@ loss_criterion = CrossEntropyLoss()
 class BertRelExtractor:

     def __init__(self):
-        self.net = PairBertNet()
+        self.net = RelBertNet()

     @staticmethod
     def load_saved(path):
         extr = BertRelExtractor()
-        extr.net = PairBertNet()
+        extr.net = RelBertNet()
         extr.net.load_state_dict(torch.load(path))
         extr.net.eval()
         return extr
@@ -60,8 +62,7 @@ class BertRelExtractor:
         else:
             train_size = int(size * (1 - valid_frac)) if size is not None else None
             train_data, _ = PairRelDataset.from_file(file_path, size=train_size)
-            valid_size = int(size * valid_frac) if size is not None else int(len(train_data) * valid_frac)
-            valid_data, _ = PairRelDataset.from_file(valid_file_path, size=valid_size)
+            valid_data, _ = PairRelDataset.from_file(valid_file_path)

         train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                   collate_fn=generate_batch)
@@ -87,16 +88,16 @@ class BertRelExtractor:
         for batch_idx, batch in enumerate(train_loader):
             # send batch to gpu
-            input_ids, attn_mask, masked_indices, fst_indices, snd_indices, target_labels = tuple(i.to(device) for i in batch)
+            input_ids, attn_mask, entity_indices, entity_mask, labels = tuple(i.to(device) for i in batch)

             # zero param gradients
             optimiser.zero_grad()

             # forward pass
-            output_scores = self.net(input_ids, attn_mask, masked_indices, fst_indices, snd_indices)
+            output_scores = self.net(input_ids, attn_mask, entity_indices, entity_mask)

             # backward pass
-            loss = loss_criterion(output_scores, target_labels)
+            loss = loss_criterion(output_scores, labels)
             loss.backward()

             # clip gradient norm
@@ -117,12 +118,11 @@ class BertRelExtractor:
                 batch_loss = 0.0

             print('epoch done')
+            torch.save(self.net.state_dict(), '{}_epoch_{}.pt'.format(save_file, epoch_idx + 1))
             if valid_data is not None:
                 self.evaluate(data=valid_data)
-        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
         end = time.time()
         print('Training took', end - start, 'seconds')
@@ -147,15 +147,14 @@ class BertRelExtractor:
         with torch.no_grad():
             for batch in test_loader:
                 # send batch to gpu
-                input_ids, attn_mask, masked_indices, fst_indices, snd_indices, target_labels = tuple(i.to(device)
-                                                                                                      for i in batch)
+                input_ids, attn_mask, entity_indices, entity_mask, labels = tuple(i.to(device)
+                                                                                  for i in batch)

                 # forward pass
-                output_scores = self.net(input_ids, attn_mask, masked_indices, fst_indices, snd_indices)
+                output_scores = self.net(input_ids, attn_mask, entity_indices, entity_mask)
                 _, output_labels = torch.max(output_scores.data, 1)

                 outputs += output_labels.tolist()
-                targets += target_labels.tolist()
+                targets += labels.tolist()

         assert len(outputs) == len(targets)
@@ -176,25 +175,24 @@ class BertRelExtractor:
         recall = metrics.recall_score(targets, outputs, average=None)
         print('recall:', recall)

-    def extract_single_relation(self, text, e1, e2):
-        ins = PairRelDataset.get_instance(text, e1, e2)
-        input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances = generate_production_batch([ins])
+    def extract_single_relation(self, text, entities):
+        ins = RelInstance.from_sentence(text, entities)
+        input_ids, attn_mask, entity_indices, entity_mask, _ = generate_production_batch([ins])

         self.net.cuda()
         self.net.eval()

         with torch.no_grad():
             # send batch to gpu
-            input_ids, attn_mask, masked_indices, prod_indices, feat_indices = tuple(i.to(device) for i in
-                                                                                     [input_ids, attn_mask,
-                                                                                      masked_indices, prod_indices,
-                                                                                      feat_indices])
+            input_ids, attn_mask, entity_indices, entity_mask = tuple(i.to(device) for i in
+                                                                      [input_ids, attn_mask, entity_indices,
+                                                                       entity_mask])

             # forward pass
-            output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
+            output_scores = softmax(self.net(input_ids, attn_mask, entity_indices, entity_mask), dim=1)
             _, output_labels = torch.max(output_scores.data, 1)

-            print(instances[0].get_relation_for_label(output_labels[0]))
+            ins.print_results_for_labels(output_labels)

     def extract_relations(self, n_aspects, aspect_index_map, aspect_counts, file_path=None, dataset=None, size=None):
         # load data
@@ -215,15 +213,14 @@ class BertRelExtractor:
         count_matrix = np.zeros((n_aspects, n_aspects))

         with torch.no_grad():
-            for input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances in loader:
+            for input_ids, attn_mask, prod_indices, feat_indices, instances in loader:
                 # send batch to gpu
-                input_ids, attn_mask, masked_indices, prod_indices, feat_indices = tuple(i.to(device) for i in
-                                                                                         [input_ids, attn_mask,
-                                                                                          masked_indices, prod_indices,
-                                                                                          feat_indices])
+                input_ids, attn_mask, prod_indices, feat_indices = tuple(i.to(device) for i in
+                                                                         [input_ids, attn_mask, prod_indices,
+                                                                          feat_indices])

                 # forward pass
-                output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
+                output_scores = softmax(self.net(input_ids, attn_mask, prod_indices, feat_indices), dim=1)
                 rel_scores = output_scores.narrow(1, 1, 2)

                 for ins, scores in zip(instances, rel_scores.tolist()):
@@ -236,4 +233,38 @@ class BertRelExtractor:
         return prob_matrix, count_matrix

+    def extract_relations2(self, n_aspects, dataset):
+        loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
+                            collate_fn=generate_production_batch)
+
+        self.net.cuda()
+        self.net.eval()
+
+        prob_matrix = np.zeros((n_aspects, n_aspects))
+        count_matrix = np.zeros((n_aspects, n_aspects))
+
+        with torch.no_grad():
+            for input_ids, attn_mask, entity_indices, combination_indices, instances in loader:
+                # send batch to gpu
+                input_ids, attn_mask, entity_indices, combination_indices = tuple(i.to(device) for i in
+                                                                                  [input_ids, attn_mask,
+                                                                                   entity_indices,
+                                                                                   combination_indices])
+
+                # forward pass
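The page is cut off above inside the new extract_relations2 method, but the visible extract_relations loop already shows how batch outputs become relation probabilities: logits pass through a softmax and output_scores.narrow(1, 1, 2) keeps only the two relation classes before they are accumulated into the per-aspect-pair matrices. A minimal sketch with toy logits; the three-class layout (column 0 = no relation, columns 1 and 2 = the relation in either direction) is an inference from the narrow call, not something the diff states:

import torch
from torch.nn.functional import softmax

# toy logits for a batch of 2 entity pairs over 3 classes (values are made up)
logits = torch.tensor([[2.0, 0.5, 0.1],
                       [0.2, 1.5, 2.5]])

output_scores = softmax(logits, dim=1)       # rows sum to 1
rel_scores = output_scores.narrow(1, 1, 2)   # keep columns 1 and 2 only
print(rel_scores)                            # per-pair probabilities of each relation direction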