# coding: utf-8
# In[2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# In[3]:
class biLM(nn.Module):
    '''
    initialize with
        embedding: pre-trained embedding layer
        hidden_size: size of the hidden states of the biLM
        n_layers: number of layers
        dropout: dropout probability
    '''
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(biLM, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = embedding
        USE_CUDA = torch.cuda.is_available()
        self.device = torch.device("cuda" if USE_CUDA else "cpu")
        self.drop = nn.Dropout(p=dropout)
        # One LSTM reads the sentence left-to-right, the other right-to-left.
        self.forwardLSTM = nn.LSTM(hidden_size,
                                   hidden_size,
                                   n_layers,
                                   dropout=(0 if n_layers == 1 else dropout))
        self.backwardLSTM = nn.LSTM(hidden_size,
                                    hidden_size,
                                    n_layers,
                                    dropout=(0 if n_layers == 1 else dropout))

    def forward(self, input_seq, input_lengths, initial_states=None):
        '''
        input_seq: size=(MAX_LEN, batch_size)
        input_lengths: contains the length of each sentence
        initial_states: tuple of (initial hidden state of LSTM, initial cell state of LSTM)
        '''
        embedded = self.embedding(input_seq)
        MAX_LEN = embedded.size()[0]
        batch_size = embedded.size()[1]
        # embedded: size=(MAX_LEN, batch_size, hidden_size)
        # outputs[..., 0, :] holds forward-LSTM outputs, outputs[..., 1, :] backward-LSTM outputs
        outputs = torch.zeros(MAX_LEN, batch_size, 2, self.hidden_size, device=self.device)
        # hidden_states stacks the forward layers first, then the backward layers
        hidden_states = torch.zeros(self.n_layers * 2, MAX_LEN, batch_size, self.hidden_size, device=self.device)
        if not initial_states:
            initial_states = (torch.zeros(self.n_layers, 1, self.hidden_size, device=self.device),
                              torch.zeros(self.n_layers, 1, self.hidden_size, device=self.device))
        # Sentences are processed one at a time, one token per LSTM step.
        for batch_n in range(batch_size):
            b_sentence = embedded[:, batch_n, :]
            length = input_lengths[batch_n]
            sentence = self.drop(b_sentence[:length, :])
            hidden_forward_state, cell_forward_state = initial_states
            hidden_backward_state, cell_backward_state = initial_states
            # Left-to-right pass over the sentence.
            for t in range(length):
                output, (hidden_forward_state, cell_forward_state) = self.forwardLSTM(
                    sentence[t].view(1, 1, -1), (hidden_forward_state, cell_forward_state))
                outputs[t, batch_n, 0, :] = output[0, 0, :]
                hidden_states[:self.n_layers, t, batch_n, :] = hidden_forward_state[:, 0, :]
            # Right-to-left pass over the sentence.
            for t in range(length):
                output, (hidden_backward_state, cell_backward_state) = self.backwardLSTM(
                    sentence[length - t - 1].view(1, 1, -1), (hidden_backward_state, cell_backward_state))
                outputs[length - t - 1, batch_n, 1, :] = output[0, 0, :]
                hidden_states[self.n_layers:, length - t - 1, batch_n, :] = hidden_backward_state[:, 0, :]
        return outputs, hidden_states, embedded
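# In[ ]:
# A minimal, hypothetical smoke test for biLM (not part of the original file):
# it builds a small randomly initialized nn.Embedding, runs two padded token-id
# sentences through the bidirectional LM, and prints the shapes documented in forward().
# The vocabulary size, sentence lengths, and tensor values below are illustrative assumptions.
if __name__ == '__main__':
    vocab_size, hidden_size, MAX_LEN, batch_size = 12, 8, 6, 2
    embedding = nn.Embedding(vocab_size, hidden_size)
    lm = biLM(hidden_size, embedding, n_layers=1, dropout=0)
    lm = lm.to(lm.device)
    input_seq = torch.randint(0, vocab_size, (MAX_LEN, batch_size), device=lm.device)
    input_lengths = [6, 4]  # number of real (non-padding) tokens in each sentence
    outputs, hidden_states, embedded = lm(input_seq, input_lengths)
    # expected: (MAX_LEN, batch_size, 2, hidden_size), (2*n_layers, MAX_LEN, batch_size, hidden_size),
    # (MAX_LEN, batch_size, hidden_size)
    print(outputs.size(), hidden_states.size(), embedded.size())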
# In[4]:
class ELMo(nn.Module):
    '''
    initialize with
        hidden_size: size of the hidden states of the underlying biLM
        embedding: pre-trained embedding layer passed to the biLM
        n_layers: number of biLM layers
        dropout: dropout probability
        l2_coef: L2 regularization coefficient for the layer weights
        do_layer_norm: whether to normalize each layer before weighting
    '''
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0, l2_coef=None, do_layer_norm=False):
        super(ELMo, self).__init__()
        USE_CUDA = torch.cuda.is_available()
        self.device = torch.device("cuda" if USE_CUDA else "cpu")
        self.hidden_size = hidden_size
        self.l2_coef = l2_coef
        self.do_layer_norm = do_layer_norm
        self.n_layers = n_layers
        self.biLM = biLM(hidden_size, embedding, n_layers, dropout)
        # Learnable weights over the 2*n_layers + 1 layers (embedding layer plus the
        # forward and backward hidden states), initialized uniformly.
        self.W = nn.Parameter(torch.tensor([1 / (2 * n_layers + 1) for i in range(2 * n_layers + 1)],
                                           device=self.device))
        # Learnable scalar that scales the whole ELMo embedding.
        self.gamma = nn.Parameter(torch.ones(1, device=self.device))

    def do_norm(self, layer, mask):
        # Normalize the layer with the mean and variance computed over unmasked positions.
        masked_layer = layer * mask
        N = torch.sum(mask) * self.hidden_size
        mean = torch.sum(masked_layer) / N
        variance = torch.sum(((masked_layer - mean) * mask) ** 2) / N
        return (layer - mean) / torch.sqrt(variance + 1e-12)

    def forward(self, input_seq, input_lengths, mask, initial_states=None):
        bilm_outputs, hidden_states, embedded = self.biLM(input_seq, input_lengths, initial_states)
        # Stack the embedding layer on top of the biLM hidden states:
        # size=(2*n_layers + 1, MAX_LEN, batch_size, hidden_size)
        concat_hidden_with_embedding = torch.cat((embedded.unsqueeze(0), hidden_states), dim=0)
        ELMo_embedding = torch.zeros(*embedded.size(), device=self.device)
        # The ELMo embedding is the gamma-scaled weighted sum over all layers.
        for i in range(2 * self.n_layers + 1):
            w = self.W[i]
            layer = concat_hidden_with_embedding[i]
            if self.do_layer_norm:
                layer = self.do_norm(layer, mask)
            ELMo_embedding = ELMo_embedding + w * layer
        ELMo_embedding = self.gamma * ELMo_embedding
        return ELMo_embedding, bilm_outputs
# In[ ]:
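# A hypothetical end-to-end sketch (not in the original file) showing how the ELMo
# module above might be used: token ids go in, and the gamma-scaled weighted sum of the
# embedding layer and biLM hidden states comes out. The mask is only consulted when
# do_layer_norm=True; its (MAX_LEN, batch_size, 1) shape here is an assumption.
if __name__ == '__main__':
    vocab_size, hidden_size, MAX_LEN, batch_size = 12, 8, 6, 2
    embedding = nn.Embedding(vocab_size, hidden_size)
    elmo = ELMo(hidden_size, embedding, n_layers=1, dropout=0, do_layer_norm=False)
    elmo = elmo.to(elmo.device)
    input_seq = torch.randint(0, vocab_size, (MAX_LEN, batch_size), device=elmo.device)
    input_lengths = [6, 4]  # number of real (non-padding) tokens in each sentence
    mask = torch.ones(MAX_LEN, batch_size, 1, device=elmo.device)
    ELMo_embedding, bilm_outputs = elmo(input_seq, input_lengths, mask)
    # expected: (MAX_LEN, batch_size, hidden_size) and (MAX_LEN, batch_size, 2, hidden_size)
    print(ELMo_embedding.size(), bilm_outputs.size())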