Below I have provided almost all of my PyTorch code, including the initialisation code, so that you can try it out yourself. The only thing you need to provide yourself are the word embeddings (I am sure you can find many word2vec models online). The first input file should be a file with tokenised text, the second input file should be a file with floating-point numbers, one per line. Because I have provided all the code, this question may seem too large and too broad. However, my question is specific enough, I think: what in my model or training loop is causing my model to not improve, or to barely improve? (See below for the results.)
I have tried to provide many comments where applicable, and I have also provided the shape transformations, so you do not have to run the code to see what is going on. The data-preparation methods are not important to inspect. The most important parts are the forward method of RegressorNet, and the training loop of RegressionRNN (admittedly, those names were badly chosen). I think the mistake is somewhere in there.
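To illustrate the expected input format, here is a tiny made-up example (the contents are invented; the file names are the ones used in the main block further down):

# Made-up two-line example of the two expected input files.
# train.tok.low.en: one tokenised, lowercased sentence per line.
with open('train.tok.low.en', 'w', encoding='utf-8') as f:
    f.write('the cat sat on the mat\na quick brown fox\n')
# train.cross: one float target per line, aligned with the sentences.
with open('train.cross', 'w', encoding='utf-8') as f:
    f.write('0.52\n1.37\n')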
from pathlib import Path
import time
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import gensim
from scipy.stats import pearsonr
from LazyTextDataset import LazyTextDataset
class RegressorNet(nn.Module):
    def __init__(self, hidden_dim, embeddings=None, drop_prob=0.0):
        super(RegressorNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.drop_prob = drop_prob

        # Load the pretrained embeddings and freeze them
        self.word_embeddings = nn.Embedding.from_pretrained(embeddings)
        self.word_embeddings.weight.requires_grad = False

        self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
        self.dropout = nn.Dropout(drop_prob)
        self.linear = nn.Linear(hidden_dim * 2, 1)
        self.lrelu = nn.LeakyReLU()

    def forward(self, batch_size, sentence_input):
        # embeds: (batch_size, seq_len, embedding_dim)
        embeds = self.word_embeddings(sentence_input)
        # view to (seq_len, batch_size, embedding_dim) for the GRU
        # w2v_out: (seq_len, batch_size, hidden_dim * 2)
        w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
        w2v_out = self.lrelu(w2v_out)
        if self.drop_prob > 0:
            w2v_out = self.dropout(w2v_out)
        # last time step through the linear layer
        # regression: torch.Size([128, 1])
        regression = self.linear(w2v_out[-1, :, :])

        return regression
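As a quick sanity check of the shapes (this is not part of the original code, and the dimensions are made up, not the real embeddings), a dummy forward pass behaves like this:

# Smoke test with a fake 10-word vocabulary of 300-dim random embeddings.
dummy_embeddings = torch.randn(10, 300)
net = RegressorNet(hidden_dim=200, embeddings=dummy_embeddings)
dummy_batch = torch.randint(0, 10, (4, 20), dtype=torch.long)  # batch of 4, seq_len 20
print(net(4, sentence_input=dummy_batch).shape)  # torch.Size([4, 1])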
class RegressionRNN:
    def __init__(self, train_files=None, test_files=None, dev_files=None):
        print('Using torch ' + torch.__version__)

        self.datasets, self.dataloaders = RegressionRNN._set_data_loaders(train_files, test_files, dev_files)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None

    @staticmethod
    def _set_data_loaders(train_files, test_files, dev_files):
        datasets = {
            'train': LazyTextDataset(train_files) if train_files is not None else None,
            'test': LazyTextDataset(test_files) if test_files is not None else None,
            'valid': LazyTextDataset(dev_files) if dev_files is not None else None
        }
        dataloaders = {
            'train': DataLoader(datasets['train'], batch_size=128, shuffle=True, num_workers=4) if train_files is not None else None,
            'test': DataLoader(datasets['test'], batch_size=128, num_workers=4) if test_files is not None else None,
            'valid': DataLoader(datasets['valid'], batch_size=128, num_workers=4) if dev_files is not None else None
        }

        return datasets, dataloaders
    @staticmethod
    def prepare_lines(data, split_on=None, cast_to=None, min_size=None, pad_str=None, max_size=None, to_numpy=False,
                      list_internal=False):
        """ Converts the string input (line) to an applicable format. """
        out = []
        for line in data:
            line = line.strip()
            if split_on:
                line = line.split(split_on)
                line = list(filter(None, line))
            else:
                line = [line]

            if cast_to is not None:
                line = [cast_to(l) for l in line]

            if min_size is not None and len(line) < min_size:
                line += (min_size - len(line)) * ['@pad@']
            elif max_size and len(line) > max_size:
                line = line[:max_size]

            if list_internal:
                line = [[item] for item in line]

            if to_numpy:
                line = np.array(line)

            out.append(line)

        if to_numpy:
            out = np.array(out)

        return out
    def prepare_w2v(self, data):
        idxs = []
        for seq in data:
            tok_idxs = []
            for word in seq:
                # Get the index of the word in the word2vec model,
                # falling back to '@unk@' for out-of-vocabulary tokens
                try:
                    tok_idxs.append(self.w2v_vocab[word].index)
                except KeyError:
                    tok_idxs.append(self.w2v_vocab['@unk@'].index)
            idxs.append(tok_idxs)
        idxs = torch.tensor(idxs, dtype=torch.long)

        return idxs
    def train(self, epochs=10):
        valid_loss_min = np.Inf
        train_losses, valid_losses = [], []
        for epoch in range(1, epochs + 1):
            epoch_start = time.time()

            train_loss, train_results = self._train_valid('train')
            valid_loss, valid_results = self._train_valid('valid')

            # Calculate Pearson correlation between predictions and targets
            try:
                train_pearson = pearsonr(train_results['predictions'], train_results['targets'])
            except FloatingPointError:
                train_pearson = "Could not calculate Pearsonr"

            try:
                valid_pearson = pearsonr(valid_results['predictions'], valid_results['targets'])
            except FloatingPointError:
                valid_pearson = "Could not calculate Pearsonr"

            # Calculate average losses
            train_loss = np.mean(train_loss)
            valid_loss = np.mean(valid_loss)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            print(f'----------\n'
                  f'Epoch {epoch} - completed in {(time.time() - epoch_start):.0f} seconds\n'
                  f'Training Loss: {train_loss:.6f}\t Pearson: {train_pearson}\n'
                  f'Validation loss: {valid_loss:.6f}\t Pearson: {valid_pearson}')

            # Track best validation loss (the torch.save call itself is not
            # shown in this snippet; see the sketch after the main block below)
            if valid_loss <= valid_loss_min and train_loss > valid_loss:
                print(f'!! Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
                valid_loss_min = valid_loss

            if train_loss <= valid_loss:
                print('!! Training loss is lte validation loss. Might be overfitting!')

            # Decrease the learning rate when validation loss plateaus
            if self.scheduler is not None:
                self.scheduler.step(valid_loss)

        print('Done training...')
    def _train_valid(self, do):
        """ Do training or validating. """
        if do not in ('train', 'valid'):
            raise ValueError("Use 'train' or 'valid' for 'do'.")

        results = {'predictions': np.array([]), 'targets': np.array([])}
        losses = np.array([])

        self.model = self.model.to(self.device)
        if do == 'train':
            self.model.train()
            torch.set_grad_enabled(True)
        else:
            self.model.eval()
            torch.set_grad_enabled(False)

        for batch_idx, data in enumerate(self.dataloaders[do], 1):
            # 1. Data preparation
            sentence = data[0]
            target = data[-1]
            curr_batch_size = target.size(0)

            # Pad/truncate every sentence to exactly 20 tokens
            sentence = self.prepare_lines(sentence, split_on=' ', min_size=20, max_size=20)
            # Convert tokens into word2vec indices
            sent_w2v_idxs = self.prepare_w2v(sentence)
            # Convert the target lines into a float tensor
            target = torch.Tensor(self.prepare_lines(target, cast_to=float))

            # Move input to GPU if available
            sent_w2v_idxs, target = sent_w2v_idxs.to(self.device), target.to(self.device)

            # 2. Predictions
            pred = self.model(curr_batch_size, sentence_input=sent_w2v_idxs)
            loss = self.criterion(pred, target)

            # 3. Optimise: only in train
            if do == 'train':
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # 4. Save results
            pred = pred.detach().cpu().numpy()
            target = target.cpu().numpy()
            results['predictions'] = np.append(results['predictions'], pred, axis=None)
            results['targets'] = np.append(results['targets'], target, axis=None)
            losses = np.append(losses, float(loss))

        torch.set_grad_enabled(True)

        return losses, results
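To make the data-preparation helpers above concrete, here is a small illustration of my own (the tokens and the stand-in vocabulary are made up; gensim 3.x's KeyedVectors.vocab maps each token to an object with an .index attribute, which is all prepare_w2v needs):

from types import SimpleNamespace

# prepare_lines: tokenise, then pad/truncate to a fixed length, or cast labels.
print(RegressionRNN.prepare_lines(['the cat sat\n'], split_on=' ', min_size=5, max_size=5))
# [['the', 'cat', 'sat', '@pad@', '@pad@']]
print(RegressionRNN.prepare_lines(['0.52\n'], cast_to=float))
# [[0.52]]

# prepare_w2v: map tokens to embedding indices, unknown words to '@unk@'.
regr_demo = RegressionRNN()
regr_demo.w2v_vocab = {'the': SimpleNamespace(index=0),
                       'cat': SimpleNamespace(index=1),
                       '@unk@': SimpleNamespace(index=2)}
print(regr_demo.prepare_w2v([['the', 'cat', 'dog']]))
# tensor([[0, 1, 2]])  ('dog' is not in the vocab, so it maps to '@unk@')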
if __name__ == '__main__':
    HIDDEN_DIM = 200

    # Load embeddings from a pretrained word2vec model
    embed_p = Path('path-to.w2v_model').resolve()
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(str(embed_p))
    # Add a padding token consisting of only zeros
    w2v_model.add(['@pad@'], [np.zeros(w2v_model.vectors.shape[1])])
    embed_weights = torch.FloatTensor(w2v_model.vectors)

    regr = RegressionRNN(train_files=(r'train.tok.low.en',
                                      r'train.cross'),
                         dev_files=(r'dev.tok.low.en',
                                    r'dev.cross'),
                         test_files=(r'test.tok.low.en',
                                     r'test.cross'))
    regr.w2v_vocab = w2v_model.vocab
    regr.model = RegressorNet(HIDDEN_DIM, embed_weights, drop_prob=0.2)
    regr.criterion = nn.MSELoss()
    regr.optimizer = optim.Adam(list(regr.model.parameters())[0:], lr=0.001)
    regr.scheduler = optim.lr_scheduler.ReduceLROnPlateau(regr.optimizer, 'min', factor=0.1, patience=5, verbose=True)

    regr.train(epochs=100)
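One note on train(): the 'Saving model ...' message does not actually write anything to disk in this snippet. A minimal sketch of what the saving step could look like (the checkpoint path is hypothetical, and this is not the original saving code):

# Sketch only: would sit inside the `if valid_loss <= valid_loss_min` branch of train().
torch.save({'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'valid_loss': valid_loss},
           'checkpoint.pth')  # hypothetical path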
For the LazyTextDataset, you can refer to the class below.
from torch.utils.data import Dataset

import linecache


class LazyTextDataset(Dataset):
    def __init__(self, paths):
        # labels are in the last path
        self.paths, self.labels_path = paths[:-1], paths[-1]

        # Count the number of non-empty lines in the labels file
        with open(self.labels_path, encoding='utf-8') as fhin:
            lines = 0
            for line in fhin:
                if line.strip() != '':
                    lines += 1

            self.num_entries = lines

    def __getitem__(self, idx):
        # linecache is 1-indexed
        data = [linecache.getline(p, idx + 1) for p in self.paths]
        label = linecache.getline(self.labels_path, idx + 1)

        return (*data, label)

    def __len__(self):
        return self.num_entries
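A quick usage illustration with toy files of my own (the paths and contents are made up):

# Toy usage of LazyTextDataset with two tiny, hypothetical files.
with open('toy.sentences', 'w', encoding='utf-8') as f:
    f.write('the cat sat on the mat\na quick brown fox\n')
with open('toy.labels', 'w', encoding='utf-8') as f:
    f.write('0.52\n1.37\n')

ds = LazyTextDataset(('toy.sentences', 'toy.labels'))
print(len(ds))  # 2
print(ds[0])    # ('the cat sat on the mat\n', '0.52\n')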
As I wrote before, I am trying to convert a Keras model to PyTorch. The original Keras code does not use an embedding layer; it uses pre-built word2vec vectors per sentence as input, so there is no embedding layer in that model. The Keras summary looks as follows (I don't have access to the base model setup).
Layer (type)                     Output Shape    Param #     Connected to
====================================================================================================
bidirectional_1 (Bidirectional)  (200, 400)      417600
____________________________________________________________________________________________________
dropout_1 (Dropout)              (200, 800)      0           merge_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense)                  (200, 1)        801         dropout_1[0][0]
====================================================================================================
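Purely as a guess on my part (again, I do not have the original Keras code), a functional-API model consistent with that summary could look like the sketch below: the merge_1 line and the (200, 800) Dropout input suggest two 400-dimensional sentence encodings being concatenated. The input dimension of 300 and the dropout rate are assumptions and do not necessarily reproduce the 417,600 parameter count, though Dense(1) on 800 inputs does give the listed 801 parameters.

# Hypothetical reconstruction of the Keras model, inferred from the summary only.
from keras.models import Model
from keras.layers import Input, GRU, Bidirectional, Dropout, Dense, concatenate

inp_a = Input(shape=(None, 300))  # sequence of word2vec vectors (dim assumed)
inp_b = Input(shape=(None, 300))
encoder = Bidirectional(GRU(200))                       # -> 400 dims per sentence
merged = concatenate([encoder(inp_a), encoder(inp_b)])  # merge_1 -> 800 dims
out = Dense(1)(Dropout(0.2)(merged))                    # 800 * 1 + 1 = 801 params
model = Model(inputs=[inp_a, inp_b], outputs=out)
model.compile(loss='mse', optimizer='adam')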
The Keras model works, and gets a +0.5 Pearson correlation between the predictions and the actual labels. The PyTorch model above, however, does not seem to work at all. To give you an idea, here are the loss (mean squared error) and Pearson (correlation coefficient, p-value) after the first epoch:
Epoch 1 - completed in 11 seconds
Training Loss: 1.684495 Pearson: (-0.0006077809280690612, 0.8173368901481127)
Validation loss: 1.708228 Pearson: (0.017794288315261794, 0.4264098054188664)
And after 100 epochs:
Epoch 100 - completed in 11 seconds
Training Loss: 1.660194 Pearson: (0.0020315421756790806, 0.4400929436716754)
Validation loss: 1.704910 Pearson: (-0.017288118524826892, 0.4396865964324158)
A final indicator that something might be wrong is that, for my 140K lines of input, each epoch takes only 10 seconds on my GTX 1080Ti. That does not feel like much, which makes me suspect that the optimisation is not actually doing anything. I cannot figure out why, though. The issue is probably in my training loop or in the model itself, but I cannot find it.
To summarise:
- the Keras model works (+0.5 Pearson between predictions and labels)
- training seems suspiciously fast for 140K sentences
- there is little to no improvement after training
What am I missing? The issue is more than likely present in the training loop or the network structure.