Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Park, Se
NLP_CW
Commits
d80c5a59
Commit
d80c5a59
authored
Feb 09, 2020
by
Se Park
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Need to bugfix
parent
7d305f80
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
28 additions
and
24 deletions
+28
-24
__pycache__/dataloader.cpython-36.pyc
__pycache__/dataloader.cpython-36.pyc
+0
-0
__pycache__/model.cpython-36.pyc
__pycache__/model.cpython-36.pyc
+0
-0
dataloader.py
dataloader.py
+14
-12
main.py
main.py
+9
-7
model.py
model.py
+5
-5
No files found.
__pycache__/dataloader.cpython-36.pyc
View file @
d80c5a59
No preview for this file type
__pycache__/model.cpython-36.pyc
View file @
d80c5a59
No preview for this file type
dataloader.py
View file @
d80c5a59
...
...
@@ -52,24 +52,26 @@ class LoadData(Dataset):
# Inserting the CLS token at the start and a SEP token after each of the source and MT sentences
tokens
=
[
"[CLS]"
]
+
src_tokens
+
[
"[SEP]"
]
+
mt_tokens
+
[
"[SEP]"
]
# Obtaining the indices of the tokens in the BERT Vocabulary
tokens_ids
=
self
.
tokenizer
.
convert_tokens_to_ids
(
tokens
)
segment_ids
=
[
0
]
*
(
len
(
src_tokens
)
+
2
)
+
[
1
]
*
(
len
(
mt_tokens
)
+
1
)
attn_mask
=
[
1
]
*
len
(
tokens_ids
)
if
len
(
tokens
)
<
self
.
maxlen
:
# Padding sentences
tokens
=
tokens
+
[
'[PAD]'
for
_
in
range
(
self
.
maxlen
-
len
(
tokens
))]
segment_ids
=
segment_ids
+
[
0
]
*
(
self
.
maxlen
-
len
(
segment_ids
))
padding
=
[
0
]
*
(
self
.
maxlen
-
len
(
tokens_ids
))
tokens_ids
+=
padding
segment_ids
+=
padding
attn_mask
+=
padding
else
:
# Truncating the lists to the specified max length
tokens
=
tokens
[:
self
.
maxlen
-
1
]
+
[
'[SEP]'
]
tokens
_ids
=
tokens
_ids
[:
self
.
maxlen
]
segment_ids
=
segment_ids
[:
self
.
maxlen
]
attn_mask
=
attn_mask
[:
self
.
maxlen
]
# Obtaining the indices of the tokens in the BERT Vocabulary
tokens_ids
=
self
.
tokenizer
.
convert_tokens_to_ids
(
tokens
)
# Converting the list to a pytorch tensor
tokens_ids_tensor
=
torch
.
tensor
(
tokens_ids
)
segment_ids_tensor
=
torch
.
tensor
(
segment_ids
,
dtype
=
torch
.
long
)
# Obtaining the attention mask, i.e. a tensor containing 1s for non-padded tokens and 0s for padded ones
attn_mask
=
(
tokens_ids_tensor
!=
0
).
long
()
return
tokens_ids_tensor
,
segment_ids_tensor
,
attn_mask
,
score
tokens_ids
=
torch
.
tensor
(
tokens_ids
,
dtype
=
torch
.
long
)
segment_ids
=
torch
.
tensor
(
segment_ids
,
dtype
=
torch
.
long
)
attn_mask
=
torch
.
tensor
(
attn_mask
,
dtype
=
torch
.
long
)
return
tokens_ids
,
attn_mask
,
segment_ids
,
score
main.py
View file @
d80c5a59
...
...
@@ -21,7 +21,7 @@ def evaluate(model, loss_fn, dataloader, device):
for
token_ids
,
segment_ids
,
attn_masks
,
labels
in
dataloader
:
token_ids
,
segment_ids
,
attn_masks
,
labels
=
token_ids
.
to
(
device
),
segment_ids
.
to
(
device
),
attn_masks
.
to
(
device
),
labels
.
to
(
device
)
qe_scores
=
model
(
token_ids
,
segment_ids
,
attn_masks
)
loss
=
loss_fn
(
qe_scores
,
labels
)
loss
=
loss_fn
(
qe_scores
.
view
(
-
1
),
labels
.
float
()
)
qe_scores
=
qe_scores
.
detach
().
cpu
().
numpy
()
qe_scores
=
qe_scores
.
reshape
((
qe_scores
.
shape
[
0
],))
...
...
@@ -29,7 +29,7 @@ def evaluate(model, loss_fn, dataloader, device):
pred
=
np
.
concatenate
((
pred
,
qe_scores
))
ref
=
np
.
concatenate
((
ref
,
labels
))
eval_loss
+=
loss
.
item
()
count
+=
1
...
...
@@ -44,6 +44,7 @@ def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device
for
ep
in
range
(
num_epoch
):
print
(
'======= Epoch {:} ======='
.
format
(
ep
))
for
it
,
(
token_ids
,
segment_ids
,
attn_masks
,
labels
)
in
enumerate
(
train_loader
):
model
.
train
()
# Clear gradients
optimizer
.
zero_grad
()
# Converting these to cuda tensors
...
...
@@ -51,7 +52,7 @@ def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device
# Obtaining scores from the model
qe_scores
=
model
(
token_ids
,
segment_ids
,
attn_masks
)
# Computing loss
loss
=
loss_fn
(
qe_scores
,
labels
)
loss
=
loss_fn
(
qe_scores
.
view
(
-
1
),
labels
.
float
()
)
# Backpropagating the gradients
loss
.
backward
()
nn
.
utils
.
clip_grad_norm_
(
model
.
parameters
(),
1.0
)
...
...
@@ -73,6 +74,7 @@ if __name__ == "__main__":
PATH
=
Path
(
"/vol/bitbucket/shp2918/nlp"
)
use_cuda
=
torch
.
cuda
.
is_available
()
# use_cuda = False
device
=
torch
.
device
(
'cuda'
if
use_cuda
else
'cpu'
)
print
(
"Using GPU: {}"
.
format
(
use_cuda
))
...
...
@@ -81,13 +83,13 @@ if __name__ == "__main__":
model
.
cuda
()
loss_fn
=
nn
.
MSELoss
()
optimizer
=
optim
.
Adam
W
(
model
.
parameters
(),
lr
=
2
e-5
)
optimizer
=
optim
.
Adam
(
model
.
parameters
(),
lr
=
5
e-5
)
MAX_LEN
=
128
MAX_LEN
=
64
train_set
=
LoadData
(
src_file
=
PATH
/
'data/train.ende.src'
,
mt_file
=
PATH
/
'data/train.ende.mt'
,
score_file
=
PATH
/
'data/train.ende.scores'
,
maxlen
=
MAX_LEN
)
val_set
=
LoadData
(
src_file
=
PATH
/
'data/dev.ende.src'
,
mt_file
=
PATH
/
'data/dev.ende.mt'
,
score_file
=
PATH
/
'data/dev.ende.scores'
,
maxlen
=
MAX_LEN
)
train_loader
=
DataLoader
(
train_set
,
batch_size
=
32
,
num_workers
=
5
)
val_loader
=
DataLoader
(
val_set
,
batch_size
=
32
,
num_workers
=
5
)
train_loader
=
DataLoader
(
train_set
,
batch_size
=
32
)
val_loader
=
DataLoader
(
val_set
,
batch_size
=
32
)
num_epoch
=
4
train
(
model
,
loss_fn
,
optimizer
,
train_loader
,
val_loader
,
num_epoch
,
device
)
model.py
View file @
d80c5a59
...
...
@@ -10,16 +10,15 @@ class QualityEstimation(nn.Module):
# Instantiating BERT model object
config
=
BertConfig
()
self
.
bert
=
BertModel
(
config
)
self
.
bert
=
BertModel
(
config
)
.
from_pretrained
(
'bert-base-multilingual-cased'
)
self
.
dropout
=
nn
.
Dropout
(
0.25
)
# LSTM and classification layers
self
.
lstm
=
nn
.
LSTM
(
input_size
=
config
.
hidden_size
,
hidden_size
=
self
.
hidden_dim
,
num_layers
=
1
,
batch_first
=
True
,
dropout
=
0
,
bidirectional
=
False
)
self
.
lstm
=
nn
.
LSTM
(
input_size
=
768
,
hidden_size
=
self
.
hidden_dim
,
num_layers
=
1
,
batch_first
=
True
,
dropout
=
0
,
bidirectional
=
False
)
self
.
fc1
=
nn
.
Linear
(
self
.
hidden_dim
,
self
.
hidden_dim
)
self
.
fc2
=
nn
.
Linear
(
self
.
hidden_dim
,
1
)
self
.
loss
=
nn
.
MSELoss
()
def
forward
(
self
,
token_ids
,
segment_ids
=
None
,
attention_mask
=
None
):
...
...
@@ -29,6 +28,7 @@ class QualityEstimation(nn.Module):
flat_attention_mask
=
attention_mask
.
view
(
-
1
,
attention_mask
.
size
(
-
1
))
encoded_layers
,
_
=
self
.
bert
(
flat_token_ids
,
flat_segment_ids
,
flat_attention_mask
)
# encoded_layers, _ = self.bert(input_ids=token_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
encoded_layers
=
self
.
dropout
(
encoded_layers
)
output
,
_
=
self
.
lstm
(
encoded_layers
)
output
=
torch
.
tanh
(
self
.
fc1
(
output
[:,
-
1
,:]))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment