A Detailed Summary of PyTorch Concepts
1. PyTorch Fundamentals
1.1 Tensors
Tensors are the fundamental data structure in PyTorch: multi-dimensional arrays with optional GPU acceleration. Core operations:

```python
import torch

# Creating tensors
x = torch.tensor([1.0, 2.0, 3.0])  # float dtype, so mean() below is valid
y = torch.zeros(2, 3)
z = torch.randn(3, 4)

# Basic operations
a = x + y               # broadcasting: (3,) + (2, 3) -> (2, 3)
b = torch.matmul(y, z)  # matrix product: (2, 3) @ (3, 4) -> (2, 4)
c = x.mean()            # reduction to a scalar tensor
```
1.2 Automatic Differentiation (Autograd)
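Autograd tracks operations on tensors that have `requires_grad=True` and builds a dynamic computation graph; calling `backward()` then computes gradients automatically. A minimal example:

```python
import torch

x = torch.tensor([2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()  # y = x1^2 + x2^2

y.backward()        # populates x.grad with dy/dx
print(x.grad)       # tensor([4., 6.]), since dy/dx_i = 2 * x_i
```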
2. Building Neural Networks
2.1 nn.Module
The base class for building neural networks in PyTorch. A subclass defines its layers in `__init__` and the forward pass in `forward`. Example:

```python
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
```
2.2 Common Layers
- `Linear`: fully connected layer
- `Conv2d`: 2D convolution layer
- `MaxPool2d`: max pooling layer
- `BatchNorm2d`: batch normalization
- `Dropout`: regularization to prevent overfitting
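As a quick illustration, the layers listed above compose naturally inside `nn.Sequential` (the channel counts and the 32x32 input size here are hypothetical):

```python
import torch
import torch.nn as nn

block = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1),  # (N, 3, 32, 32) -> (N, 16, 32, 32)
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.MaxPool2d(2),                             # -> (N, 16, 16, 16)
    nn.Flatten(),
    nn.Dropout(0.5),
    nn.Linear(16 * 16 * 16, 10)                  # fully connected classifier head
)

out = block(torch.randn(8, 3, 32, 32))  # out.shape == (8, 10)
```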
3. Data Handling
3.1 Dataset and DataLoader
`Dataset` defines how samples are accessed; `DataLoader` batches and shuffles them. Example:

```python
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

dataset = CustomDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
```
4. Model Training
4.1 Optimizers and Loss Functions
Common optimizers: SGD, Adam. Common loss functions: CrossEntropyLoss, MSELoss. A typical training loop:

```python
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    for data, labels in dataloader:
        optimizer.zero_grad()   # clear gradients from the previous step
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()         # backpropagate
        optimizer.step()        # update parameters
```
4.2 Saving and Loading Models
python">
torch. save( model. state_dict( ) , 'model.pth' )
model. load_state_dict( torch. load( 'model.pth' ) )
5. GPU Acceleration
5.1 Device Management
python">device = torch. device( 'cuda' if torch. cuda. is_available( ) else 'cpu' )
model = model. to( device)
data = data. to( device)
6. Advanced Features
6.1 Distributed Training
- `DistributedDataParallel`: multi-process, multi-GPU training (the recommended approach)
- `DataParallel`: simple single-process data parallelism
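A minimal sketch of both options, assuming `model` is already defined; the DDP variant assumes the script is launched with `torchrun`, which sets the `LOCAL_RANK` environment variable:

```python
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Option 1: DataParallel, one process, model replicated across visible GPUs
model = nn.DataParallel(model).cuda()

# Option 2: DistributedDataParallel, one process per GPU,
# launched e.g. with `torchrun --nproc_per_node=4 train.py`
dist.init_process_group(backend='nccl')
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
model = DDP(model.cuda(), device_ids=[local_rank])
```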
6.2 TorchScript
Converts a PyTorch model into an optimizable, serializable format that supports production deployment (e.g. loaded from C++ via libtorch, with no Python dependency).
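A short sketch of the two conversion paths (the 1x3x224x224 dummy input is an assumption for an image model):

```python
import torch

# Scripting: compiles the module, preserving control flow
scripted = torch.jit.script(model)

# Tracing: records the ops executed for one example input
traced = torch.jit.trace(model, torch.randn(1, 3, 224, 224))

scripted.save('model_scripted.pt')
loaded = torch.jit.load('model_scripted.pt')  # usable without the Python class
```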
6.3 Model Quantization
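Quantization stores weights (and optionally activations) in lower precision such as int8 to shrink the model and speed up inference. A minimal sketch using dynamic quantization, the simplest entry point (applying it to `nn.Linear` layers is an assumption about the model):

```python
import torch
import torch.nn as nn

# Dynamic quantization: int8 weights, activations quantized on the fly
quantized_model = torch.quantization.quantize_dynamic(
    model,              # the trained float32 model
    {nn.Linear},        # layer types to quantize
    dtype=torch.qint8
)
```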
7. Debugging and Optimization
7.1 Memory Optimization
- Use `del` to release tensors that are no longer needed
- Use `torch.no_grad()` to reduce memory during inference
- Use gradient accumulation to handle large effective batch sizes (see 10.1)
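A short sketch of the first two points (`intermediate`, `model`, and `data` are assumed to exist):

```python
import torch

# Drop the reference, then return cached blocks to the GPU allocator
del intermediate
torch.cuda.empty_cache()

# No gradient tracking during inference: activations are not kept for backward
with torch.no_grad():
    output = model(data)
```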
7.2 Profiling
- `torch.autograd.profiler` for operator-level timing
- `nvprof` for CUDA kernel profiling
- Memory-leak detection
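For example, with the built-in autograd profiler (assuming `model` and `data` are defined):

```python
import torch

with torch.autograd.profiler.profile(use_cuda=torch.cuda.is_available()) as prof:
    model(data)

# Print the most expensive operators
print(prof.key_averages().table(sort_by='cpu_time_total', row_limit=10))
```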
8. Best Practices
8.1 Code Conventions
- Use `nn.Sequential` to organize layers
- Use `nn.ModuleList` and `nn.ParameterList` where appropriate, so submodules and parameters are properly registered
- Handle the batch dimension correctly
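A small sketch contrasting the first two points (the layer sizes are arbitrary):

```python
import torch
import torch.nn as nn

# nn.Sequential: a fixed pipeline of layers
head = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

# nn.ModuleList: layers applied with custom logic, still registered as submodules
class DeepNet(nn.Module):
    def __init__(self, num_blocks):
        super().__init__()
        self.blocks = nn.ModuleList(
            [nn.Linear(128, 128) for _ in range(num_blocks)]
        )

    def forward(self, x):  # x: (batch, 128); keep the batch dimension first
        for block in self.blocks:
            x = torch.relu(block(x))
        return x
```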
8.2 Training Tips
Techniques such as gradient clipping, early stopping, learning-rate scheduling, and mixed precision are covered in detail in Sections 9 and 10.
8.3 Deployment Considerations
- Model export (ONNX)
- Serving deployments
- Mobile deployment
- Edge-device deployment
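For the first point, a minimal ONNX export sketch (the 1x3x224x224 dummy input is an assumption for an image model):

```python
import torch

model.eval()
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    model, dummy_input, 'model.onnx',
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}}  # variable batch
)
```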
9. Solutions to Common Problems
9.1 Data Preprocessing
python">
from torchvision import transformstransform = transforms. Compose( [ transforms. Resize( ( 224 , 224 ) ) , transforms. ToTensor( ) , transforms. Normalize( mean= [ 0.485 , 0.456 , 0.406 ] , std= [ 0.229 , 0.224 , 0.225 ] )
] )
from torch. nn. utils. rnn import pad_sequencedef text_preprocess ( text_list, vocab) : indices = [ [ vocab[ word] for word in text. split( ) ] for text in text_list] padded = pad_sequence( [ torch. tensor( x) for x in indices] , batch_first= True ) return padded
9.2 Model Evaluation
python">def evaluate_model ( model, test_loader, criterion, device) : model. eval ( ) total_loss = 0 correct = 0 total = 0 with torch. no_grad( ) : for data, target in test_loader: data, target = data. to( device) , target. to( device) output = model( data) total_loss += criterion( output, target) . item( ) pred = output. argmax( dim= 1 , keepdim= True ) correct += pred. eq( target. view_as( pred) ) . sum ( ) . item( ) total += target. size( 0 ) avg_loss = total_loss / len ( test_loader) accuracy = 100 . * correct / totalreturn avg_loss, accuracy
9.3 Early Stopping Implementation
python">class EarlyStopping : def __init__ ( self, patience= 7 , min_delta= 0 ) : self. patience = patienceself. min_delta = min_deltaself. counter = 0 self. best_loss = None self. early_stop = False def __call__ ( self, val_loss) : if self. best_loss is None : self. best_loss = val_losselif val_loss > self. best_loss - self. min_delta: self. counter += 1 if self. counter >= self. patience: self. early_stop = True else : self. best_loss = val_lossself. counter = 0
9.4 Training Monitoring
python">class TrainingMonitor : def __init__ ( self) : self. history = { 'train_loss' : [ ] , 'val_loss' : [ ] , 'accuracy' : [ ] } def update ( self, metrics) : for k, v in metrics. items( ) : self. history[ k] . append( v) def plot_metrics ( self) : epochs = range ( 1 , len ( self. history[ 'train_loss' ] ) + 1 ) plt. figure( figsize= ( 12 , 4 ) ) plt. subplot( 1 , 2 , 1 ) plt. plot( epochs, self. history[ 'train_loss' ] , 'b-' , label= 'Training Loss' ) plt. plot( epochs, self. history[ 'val_loss' ] , 'r-' , label= 'Validation Loss' ) plt. title( 'Training and Validation Loss' ) plt. xlabel( 'Epochs' ) plt. ylabel( 'Loss' ) plt. legend( ) plt. subplot( 1 , 2 , 2 ) plt. plot( epochs, self. history[ 'accuracy' ] , 'g-' , label= 'Accuracy' ) plt. title( 'Model Accuracy' ) plt. xlabel( 'Epochs' ) plt. ylabel( 'Accuracy' ) plt. legend( ) plt. tight_layout( ) plt. show( )
9.5 Common Error Handling
```python
import torch
import torch.nn as nn

# 1. CUDA out-of-memory handling
try:
    # A forward pass on a large batch
    output = model(large_input)
except RuntimeError as e:
    if "out of memory" in str(e):
        # Release cached blocks, then retry in smaller chunks
        torch.cuda.empty_cache()
        output = torch.cat([model(chunk) for chunk in large_input.split(2)])

# 2. Gradient explosion: clip the global gradient norm after backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# 3. Parallelization error handling
if torch.cuda.device_count() > 1:
    try:
        model = nn.DataParallel(model)
    except RuntimeError as e:
        print(f"Parallelization failed: {e}")
        # Fall back to a single GPU
        model = model.to(device)
```
9.6 Performance Optimization Tips
python">
from torch. cuda. amp import autocast, GradScalerscaler = GradScaler( ) for data, targets in train_loader: with autocast( ) : output = model( data) loss = criterion( output, targets) scaler. scale( loss) . backward( ) scaler. step( optimizer) scaler. update( )
train_loader = DataLoader( dataset, batch_size= 32 , shuffle= True , num_workers= 4 , pin_memory= True
)
with torch. no_grad( ) : model. eval ( ) traced_model = torch. jit. trace( model, torch. randn( 1 , 3 , 224 , 224 ) ) output = traced_model( input_data)
9.7 Practical Application Examples
python">
from torchvision. models import resnet50def create_transfer_model ( num_classes) : model = resnet50( pretrained= True ) for param in model. parameters( ) : param. requires_grad = False model. fc = nn. Linear( model. fc. in_features, num_classes) return model
class EnsembleModel ( nn. Module) : def __init__ ( self, models) : super ( ) . __init__( ) self. models = nn. ModuleList( models) def forward ( self, x) : outputs = [ model( x) for model in self. models] return torch. stack( outputs) . mean( 0 )
class FocalLoss ( nn. Module) : def __init__ ( self, alpha= 1 , gamma= 2 ) : super ( ) . __init__( ) self. alpha = alphaself. gamma = gammadef forward ( self, inputs, targets) : ce_loss = F. cross_entropy( inputs, targets, reduction= 'none' ) pt = torch. exp( - ce_loss) focal_loss = self. alpha * ( 1 - pt) ** self. gamma * ce_lossreturn focal_loss. mean( )
class SelfAttention ( nn. Module) : def __init__ ( self, dim) : super ( ) . __init__( ) self. query = nn. Linear( dim, dim) self. key = nn. Linear( dim, dim) self. value = nn. Linear( dim, dim) def forward ( self, x) : q = self. query( x) k = self. key( x) v = self. value( x) scores = torch. matmul( q, k. transpose( - 2 , - 1 ) ) / math. sqrt( q. size( - 1 ) ) attention = F. softmax( scores, dim= - 1 ) return torch. matmul( attention, v)
class Generator ( nn. Module) : def __init__ ( self, latent_dim, img_shape) : super ( ) . __init__( ) self. model = nn. Sequential( nn. Linear( latent_dim, 128 ) , nn. LeakyReLU( 0.2 ) , nn. Linear( 128 , 256 ) , nn. BatchNorm1d( 256 ) , nn. LeakyReLU( 0.2 ) , nn. Linear( 256 , np. prod( img_shape) ) , nn. Tanh( ) ) self. img_shape = img_shapedef forward ( self, z) : img = self. model( z) return img. view( img. size( 0 ) , * self. img_shape) class Discriminator ( nn. Module) : def __init__ ( self, img_shape) : super ( ) . __init__( ) self. model = nn. Sequential( nn. Linear( np. prod( img_shape) , 256 ) , nn. LeakyReLU( 0.2 ) , nn. Linear( 256 , 128 ) , nn. LeakyReLU( 0.2 ) , nn. Linear( 128 , 1 ) , nn. Sigmoid( ) ) def forward ( self, img) : img_flat = img. view( img. size( 0 ) , - 1 ) return self. model( img_flat)
10. Advanced Training Techniques
10.1 Gradient Accumulation
python">accumulation_steps = 4
optimizer. zero_grad( ) for i, ( data, target) in enumerate ( train_loader) : output = model( data) loss = criterion( output, target) / accumulation_stepsloss. backward( ) if ( i + 1 ) % accumulation_steps == 0 : optimizer. step( ) optimizer. zero_grad( )
10.2 Learning-Rate Scheduling
python">
scheduler = torch. optim. lr_scheduler. CosineAnnealingLR( optimizer, T_max= 200 )
scheduler = torch. optim. lr_scheduler. CosineAnnealingWarmRestarts( optimizer, T_0= 50 , T_mult= 2 )
scheduler = torch. optim. lr_scheduler. OneCycleLR( optimizer, max_lr= 0.1 , steps_per_epoch= len ( train_loader) , epochs= num_epochs
)
10.3 Knowledge Distillation

```python
class DistillationLoss(nn.Module):
    def __init__(self, alpha=0.5, temperature=2.0):
        super().__init__()
        self.alpha = alpha
        self.T = temperature

    def forward(self, student_outputs, teacher_outputs, targets):
        # Hard loss: student vs. ground-truth labels
        hard_loss = F.cross_entropy(student_outputs, targets)
        # Soft loss: student vs. temperature-scaled teacher distribution
        soft_loss = nn.KLDivLoss(reduction='batchmean')(
            F.log_softmax(student_outputs / self.T, dim=1),
            F.softmax(teacher_outputs / self.T, dim=1)
        ) * (self.T * self.T)
        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss
```
10.4 Adversarial Training

```python
def fgsm_attack(model, loss, data, epsilon, data_grad):
    # Perturb the input along the sign of its gradient (FGSM)
    sign_data_grad = data_grad.sign()
    perturbed_data = data + epsilon * sign_data_grad
    perturbed_data = torch.clamp(perturbed_data, 0, 1)
    return perturbed_data.detach()

def train_with_adversarial(model, train_loader, optimizer, epsilon):
    for data, target in train_loader:
        data.requires_grad = True  # needed so data.grad is populated
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        data_grad = data.grad.data
        # Second pass on the adversarial example; gradients from the clean
        # and adversarial passes accumulate before the optimizer step
        perturbed_data = fgsm_attack(model, loss, data, epsilon, data_grad)
        output = model(perturbed_data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
```
10.5 Half-Precision Training
python">
scaler = torch. cuda. amp. GradScaler( )
optimizer = torch. optim. Adam( model. parameters( ) ) for data, target in train_loader: optimizer. zero_grad( ) with torch. cuda. amp. autocast( ) : output = model( data) loss = criterion( output, target) scaler. scale( loss) . backward( ) scaler. step( optimizer) scaler. update( )
model. half( )
for data, target in train_loader: data = data. half( ) output = model( data) loss = criterion( output, target)