Running ResNet on a GPU with Google Colab
My computer's graphics card is an AMD, so every training epoch took an extremely long time. I happened to hear about Google Colab, started using it, and it feels great.
Colaboratory is a free Jupyter notebook environment that requires no setup and runs entirely in the cloud.
With Colaboratory you can write and execute code, save and share your analyses, and access powerful computing resources, all free of charge from your browser.
As always, this is 菜菜小孙; thanks to Jikess from 某东.
Create a Google account
Phone number cannot be registered
Just follow the steps one by one, but you may hit the problem that a Chinese phone number cannot be used to register, which is maddening.
I downloaded Google Chrome and uninstalled the "lite" browser I had before. In Settings I set English as the default language and removed Chinese (Simplified), which solved the phone-number registration problem.
After that the account could be created.
Open Google Colab
Website
To visit the site you will need... (hehe): https://colab.research.google.com/notebooks/welcome.ipynb
Create a notebook
After using Colab for the first time, log in to your Google Drive and you will find that you can now create a Colaboratory notebook; create one.
Under New you can create a folder as well as a Colaboratory notebook.
Use the GPU
Change the runtime type and set the hardware accelerator to GPU (None means CPU only).
You also need to click Connect here.
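Before training, it is worth confirming that the GPU runtime actually took effect. A minimal sanity check of my own (it assumes PyTorch is preinstalled on Colab, which it normally is):

import torch
print(torch.cuda.is_available())          # should print True once the GPU runtime is active
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # name of the GPU Colab assigned

You can also run !nvidia-smi in a cell to see the assigned GPU and its memory usage.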
Run code to mount Google Drive
This step is important. Colab works by allocating you a remote host with a GPU, so its default path is not the path where your Google Drive (and therefore your code) lives. The first step is to mount your Google Drive onto that remote host:
from google.colab import drive
drive.mount('/content/gdrive')
Type and run these lines; a box will pop up asking for an authorization code. Click the link, follow the prompts, and copy-paste the code back to finish.
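To confirm the mount worked, you can list the contents of your Drive from a cell (a quick check of my own, using the same mount point as above):

!ls "/content/gdrive/My Drive"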
Change the working directory
import os
os.chdir("/content/gdrive/My Drive/Pytorch_try")
Pytorch_try is my own folder.
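As another quick sanity check (my addition), print the working directory and its contents so you know the script you upload will be found:

import os
print(os.getcwd())      # should show /content/gdrive/My Drive/Pytorch_try
print(os.listdir('.'))  # files in the folder, e.g. CIFAR_RES.py once it is uploaded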
Upload a file and run it
Click the folder name and upload your file.
The first approach was to run the code directly in a cell, just like in a Jupyter notebook, but for some reason it raised an error (a likely explanation follows below):
An exception has occurred, use %tb to see the full traceback. SystemExit: 2
So I tried uploading the file and running it as a script instead:
!python CIFAR_RES.py
And it ran, at only about 25 s per epoch, which feels great. Thanks to this post for the pointer: https://www.jianshu.com/p/a42d69568966
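For reference, the SystemExit: 2 above most likely comes from argparse: when the script runs inside a notebook cell, the Jupyter kernel passes its own command-line flags (such as -f ...), parse_args() rejects them and exits with code 2. Running the file with !python avoids this. If you do want to run the code inside a cell, one possible workaround (not the method used here) is to make the parser ignore the kernel's arguments:

# inside main(): parse an empty argument list instead of sys.argv,
# so the notebook kernel's own flags never reach argparse (defaults are used)
args = parser.parse_args(args=[])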
Below is the ResNet code for training on CIFAR10.
import torch as t
import torchvision as tv
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
import torch.backends.cudnn as cudnn
import matplotlib.pyplot as plt
import datetime
import argparse

# number of worker threads for data loading
WORKERS = 4
# file name for saving the network parameters
PARAS_FN = 'cifar_resnet_params.pkl'
# where the CIFAR10 data is stored
ROOT = './data'
# loss function
loss_func = nn.CrossEntropyLoss()
# best result so far
best_acc = 0
# accuracy history, used to plot the curves
global_train_acc = []
global_test_acc = []


'''
Residual block.
in_channels, out_channels: input/output channel counts of the block.
For the first layer both are 64; for the other layers they differ.
Within each layer, stride is 1 when in/out channels are equal, otherwise 2.
'''
class ResBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResBlock, self).__init__()
        # First convolution of the block.
        # Changes the channel count in->out in the first block of every layer (except layer 1).
        # Spatial size: stride=2 -> (w-3+2)/2 + 1 = w/2, i.e. w/2 x w/2;
        #               stride=1 -> (w-3+2)/1 + 1 = w, size unchanged.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        # Second convolution of the block: channels and spatial size unchanged.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Shortcut of the block.
        # If in/out channels differ, transform the channel count and spatial size so the
        # shortcut can be added to the residual branch: channels x2, spatial size /2.
        if in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=2),
                nn.BatchNorm2d(out_channels))
        else:
            # Same channel count: no transform needed, identity = x in forward.
            self.downsample = None

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out


'''
Network definition.
'''
class ResNet34(nn.Module):
    def __init__(self, block):
        super(ResNet34, self).__init__()
        # initial convolution and pooling layers
        self.first = nn.Sequential(
            # conv1: 7x7 kernel, stride 2, padding 3, output map: (32-7+2*3)/2 + 1 -> 16x16
            nn.Conv2d(3, 64, 7, 2, 3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # max pooling: 3x3 kernel, stride 1 (the 32x32 input is small, so don't shrink further),
            # padding 1, output map: (16-3+2*1)/1 + 1 -> 16x16
            nn.MaxPool2d(3, 1, 1))

        # layer 1: channel count unchanged
        self.layer1 = self.make_layer(block, 64, 64, 3, 1)
        # layers 2, 3, 4: channels x2, spatial size /2
        self.layer2 = self.make_layer(block, 64, 128, 4, 2)    # output 8x8
        self.layer3 = self.make_layer(block, 128, 256, 6, 2)   # output 4x4
        self.layer4 = self.make_layer(block, 256, 512, 3, 2)   # output 2x2

        self.avg_pool = nn.AvgPool2d(2)  # output 512x1
        self.fc = nn.Linear(512, 10)

    def make_layer(self, block, in_channels, out_channels, block_num, stride):
        layers = []
        # first block of the layer: channel count may change
        layers.append(block(in_channels, out_channels, stride))
        # remaining blocks of the layer: channels and spatial size unchanged
        for i in range(block_num - 1):
            layers.append(block(out_channels, out_channels, 1))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.first(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        # x.size()[0]: batch size
        x = x.view(x.size()[0], -1)
        x = self.fc(x)
        return x


'''
Train the network for one epoch.
net: the model
train_data_load: training data loader
optimizer: the optimizer
epoch: current epoch index
log_interval: how often (in batches) to record the training accuracy
'''
def net_train(net, train_data_load, optimizer, epoch, log_interval):
    net.train()
    begin = datetime.datetime.now()
    # total number of samples
    total = len(train_data_load.dataset)
    # accumulated loss over the batches
    train_loss = 0
    # number of correctly classified samples
    ok = 0

    for i, data in enumerate(train_data_load, 0):
        img, label = data
        img, label = img.cuda(), label.cuda()

        optimizer.zero_grad()
        outs = net(img)
        loss = loss_func(outs, label)
        loss.backward()
        optimizer.step()

        # accumulate the loss
        train_loss += loss.item()
        _, predicted = t.max(outs.data, 1)
        # accumulate the number of correct predictions
        ok += (predicted == label).sum()

        if (i + 1) % log_interval == 0:
            # number of samples trained so far
            traind_total = (i + 1) * len(label)
            # accuracy so far
            acc = 100. * ok / traind_total
            # record the training accuracy for the curve
            global_train_acc.append(acc)

    end = datetime.datetime.now()
    print('one epoch spend: ', end - begin)


'''
Check accuracy on the test set.
'''
def net_test(net, test_data_load, epoch):
    net.eval()
    ok = 0
    for i, data in enumerate(test_data_load):
        img, label = data
        img, label = img.cuda(), label.cuda()
        outs = net(img)
        _, pre = t.max(outs.data, 1)
        ok += (pre == label).sum()

    acc = ok.item() * 100. / (len(test_data_load.dataset))
    print('EPOCH:{}, ACC:{}\n'.format(epoch, acc))
    # record the test accuracy for the curve
    global_test_acc.append(acc)

    # keep track of the best accuracy
    global best_acc
    if acc > best_acc:
        best_acc = acc


'''
Show one image from the dataset.
'''
def img_show(dataset, index):
    classes = ('plane', 'car', 'bird', 'cat',
               'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    show = ToPILImage()
    data, label = dataset[index]
    print('img is a ', classes[label])
    show((data + 1) / 2).resize((100, 100)).show()


'''
Plot the training and test accuracy curves.
'''
def show_acc_curv(ratio):
    # x, y of the training accuracy curve
    train_x = list(range(len(global_train_acc)))
    train_y = global_train_acc
    # x, y of the test accuracy curve
    # one test accuracy for every `ratio` training accuracies
    test_x = train_x[ratio - 1::ratio]
    test_y = global_test_acc

    plt.title('CIFAR10 RESNET34 ACC')
    plt.plot(train_x, train_y, color='green', label='training accuracy')
    plt.plot(test_x, test_y, color='red', label='testing accuracy')
    # show the legend
    plt.legend()
    plt.xlabel('iterations')
    plt.ylabel('accs')
    plt.show()


def main():
    # training hyper-parameters, settable from the command line
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 ResNet34 Example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=100, metavar='N',
                        help='input batch size for testing (default: 100)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('--lr', type=float, default=0.1, metavar='LR',
                        help='learning rate (default: 0.1)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='SGD momentum (default: 0.9)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status (default: 10)')
    parser.add_argument('--no-train', action='store_true', default=False,
                        help='skip training and just test with the saved parameters')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    # Image transforms. From the ToTensor source docstring:
    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
    Converts a PIL Image or numpy.ndarray (H x W x C) in the range
    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
    """
    # Normalize maps [0.0, 1.0] to [-1, 1]: ([0, 1] - 0.5) / 0.5 = [-1, 1]
    transform = tv.transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

    # datasets
    train_data = tv.datasets.CIFAR10(root=ROOT, train=True, download=True, transform=transform)
    test_data = tv.datasets.CIFAR10(root=ROOT, train=False, download=False, transform=transform)

    train_load = t.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=WORKERS)
    test_load = t.utils.data.DataLoader(test_data, batch_size=args.test_batch_size, shuffle=False, num_workers=WORKERS)

    net = ResNet34(ResBlock).cuda()
    print(net)

    # parallel execution to speed things up
    net = nn.DataParallel(net)
    cudnn.benchmark = True

    # if not training, load the saved parameters and evaluate on the test set
    if args.no_train:
        net.load_state_dict(t.load(PARAS_FN))
        net_test(net, test_load, 0)
        return

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum)

    start_time = datetime.datetime.now()

    for epoch in range(1, args.epochs + 1):
        net_train(net, train_load, optimizer, epoch, args.log_interval)

        # check accuracy on the test set after each epoch
        net_test(net, test_load, epoch)

    end_time = datetime.datetime.now()

    global best_acc
    print('CIFAR10 pytorch ResNet34 Train: EPOCH:{}, BATCH_SZ:{}, LR:{}, ACC:{}'.format(args.epochs, args.batch_size, args.lr, best_acc))
    print('train spend time: ', end_time - start_time)

    if args.save_model:
        t.save(net.state_dict(), PARAS_FN)


if __name__ == '__main__':
    main()
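Since the hyper-parameters are exposed through argparse, the script can be driven from a Colab cell with different settings; for example (flag names are taken from the parser above, the values are just illustrative):

!python CIFAR_RES.py --epochs 50 --lr 0.1 --save-model
!python CIFAR_RES.py --no-train    # skip training and evaluate the saved parameters instead

Note that --no-train only works after a run with --save-model has written cifar_resnet_params.pkl.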