
How to prevent weight updates in Caffe

  • RichieZhu · asked 7 years ago

    Some of my network's layers are initialized from a pretrained model. I want to freeze their parameters and train only the other layers.

    I followed this page and set lr_mult and decay_mult to 0, added propagate_down: false, and even set base_lr: 0 and weight_decay: 0 in the solver:

      layer {
        name: "data"
        type: "ImageData"
        top: "data"
        top: "label"
        include {
          phase: TRAIN
        }
        transform_param {
          scale: 0.017
          mirror: true
          crop_size: 32
          mean_value: 115
          mean_value: 126
          mean_value: 130
          color: true
          contrast: true
          brightness: true
        }
        image_data_param {
          source: "/data/zhuhao5/data/cifar100/cifar100_train_replicate.txt"
          batch_size: 64
          shuffle: true
          #pair_size: 3
        }
      }
      layer {
        name: "data"
        type: "ImageData"
        top: "data"
        top: "label"
        include {
          phase: TEST
        }
        transform_param {
          scale: 0.017
          mirror: false
          crop_size: 32
          mean_value: 115
          mean_value: 126
          mean_value: 130
        }
        image_data_param {
          source: "/data/zhuhao5/data/cifar100/cifar100_test.txt"
          batch_size: 100
          shuffle: false
        }
      }
      #-------------- TEACHER --------------------
      layer {
        name: "conv1"
        type: "Convolution"
        bottom: "data"
        propagate_down: false
        top: "conv1"
        param { 
          lr_mult: 0 
          decay_mult: 0 
        }
        convolution_param {
          num_output: 16
          bias_term: false
          pad: 1
          kernel_size: 3
          stride: 1
          weight_filler {
            type: "msra"
          }
        }
      }
      layer {
        name: "res2_1a_1_bn"
        type: "BatchNorm"
        bottom: "conv1"
        propagate_down: false
        top: "res2_1a_1_bn"
        param { 
          lr_mult: 0 
          decay_mult: 0 
        }
        param {
          lr_mult: 0
          decay_mult: 0
        }
      }
      layer {
        name: "res2_1a_1_scale"
        type: "Scale"
        bottom: "res2_1a_1_bn"
        propagate_down: false
        top: "res2_1a_1_bn"
        param {
          lr_mult: 0
          decay_mult: 0
        }
        scale_param {
          bias_term: true
        }
      }
      layer {
        name: "res2_1a_1_relu"
        type: "ReLU"
        bottom: "res2_1a_1_bn"
        propagate_down: false
        top: "res2_1a_1_bn"
      }
      layer {
        name: "pool_5"
        type: "Pooling"
        bottom: "res2_1a_1_bn"
        propagate_down: false
        top: "pool_5"
        pooling_param {
          pool: AVE
          global_pooling: true
        }
      }
      layer {
        name: "fc100"
        type: "InnerProduct"
        bottom: "pool_5"
        propagate_down: false
        top: "fc100"
        param {
          lr_mult: 0
          decay_mult: 0
        }
        param {
          lr_mult: 0
          decay_mult: 0
        }
        inner_product_param {
          num_output: 100
          weight_filler {
            type: "msra"
          }
          bias_filler {
            type: "constant"
            value: 0
          }
        }
      }
      #---------------------------------
      layer {
        name: "tea_soft_loss"
        type: "SoftmaxWithLoss"
        bottom: "fc100"
        bottom: "label"
        propagate_down: false
        propagate_down: false
        top: "tea_soft_loss"
        loss_weight: 0
      }
    
      ##----------- ACCURACY----------------
    
      layer {
        name: "teacher_accuracy"
        type: "Accuracy"
        bottom: "fc100"
        bottom: "label"
        top: "teacher_accuracy"
        accuracy_param {
          top_k: 1
        }
      }
    

    Here is the solver:

    test_iter: 100
    test_interval: 10
    base_lr: 0
    momentum: 0
    weight_decay: 0
    lr_policy: "poly"
    power: 1
    display: 10000
    max_iter: 80000
    snapshot: 5000
    type: "SGD"
    solver_mode: GPU
    random_seed: 10086

    And here is the training log:

    I0829 16:31:39.363433 14986 net.cpp:200] teacher_accuracy does not need backward computation.
    I0829 16:31:39.363438 14986 net.cpp:200] tea_soft_loss does not need backward computation.
    I0829 16:31:39.363442 14986 net.cpp:200] fc100_fc100_0_split does not need backward computation.
    I0829 16:31:39.363446 14986 net.cpp:200] fc100 does not need backward computation.
    I0829 16:31:39.363451 14986 net.cpp:200] pool_5 does not need backward computation.
    I0829 16:31:39.363454 14986 net.cpp:200] res2_1a_1_relu does not need backward computation.
    I0829 16:31:39.363458 14986 net.cpp:200] res2_1a_1_scale does not need backward computation.
    I0829 16:31:39.363462 14986 net.cpp:200] res2_1a_1_bn does not need backward computation.
    I0829 16:31:39.363466 14986 net.cpp:200] conv1 does not need backward computation.
    I0829 16:31:39.363471 14986 net.cpp:200] label_data_1_split does not need backward computation.
    I0829 16:31:39.363485 14986 net.cpp:200] data does not need backward computation.
    I0829 16:31:39.363490 14986 net.cpp:242] This network produces output tea_soft_loss
    I0829 16:31:39.363494 14986 net.cpp:242] This network produces output teacher_accuracy
    I0829 16:31:39.363507 14986 net.cpp:255] Network initialization done.
    I0829 16:31:39.363559 14986 solver.cpp:56] Solver scaffolding done.
    I0829 16:31:39.363852 14986 caffe.cpp:248] Starting Optimization
    I0829 16:31:39.363862 14986 solver.cpp:272] Solving WRN_22_12_to_WRN_18_4_v5_net
    I0829 16:31:39.363865 14986 solver.cpp:273] Learning Rate Policy: poly
    I0829 16:31:39.365981 14986 solver.cpp:330] Iteration 0, Testing net (#0)
    I0829 16:31:39.366190 14986 blocking_queue.cpp:49] Waiting for data
    I0829 16:31:39.742347 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 85.9064
    I0829 16:31:39.742437 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0113
    I0829 16:31:39.749806 14986 solver.cpp:218] Iteration 0 (0 iter/s, 0.385886s/10000 iters), loss = 0
    I0829 16:31:39.749862 14986 solver.cpp:237]     Train net output #0: tea_soft_loss = 4.97483
    I0829 16:31:39.749877 14986 solver.cpp:237]     Train net output #1: teacher_accuracy = 0
    I0829 16:31:39.749908 14986 sgd_solver.cpp:105] Iteration 0, lr = 0
    I0829 16:31:39.794306 14986 solver.cpp:330] Iteration 10, Testing net (#0)
    I0829 16:31:40.171447 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.9119
    I0829 16:31:40.171510 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0115
    I0829 16:31:40.219133 14986 solver.cpp:330] Iteration 20, Testing net (#0)
    I0829 16:31:40.596911 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91862
    I0829 16:31:40.596971 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0116
    I0829 16:31:40.645246 14986 solver.cpp:330] Iteration 30, Testing net (#0)
    I0829 16:31:41.021711 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.92105
    I0829 16:31:41.021772 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:41.069464 14986 solver.cpp:330] Iteration 40, Testing net (#0)
    I0829 16:31:41.447345 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91916
    I0829 16:31:41.447407 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:41.495157 14986 solver.cpp:330] Iteration 50, Testing net (#0)
    I0829 16:31:41.905607 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.9208
    I0829 16:31:41.905654 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:41.952659 14986 solver.cpp:330] Iteration 60, Testing net (#0)
    I0829 16:31:42.327942 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91936
    I0829 16:31:42.328025 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:42.374279 14986 solver.cpp:330] Iteration 70, Testing net (#0)
    I0829 16:31:42.761359 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91859
    I0829 16:31:42.761430 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:42.807821 14986 solver.cpp:330] Iteration 80, Testing net (#0)
    I0829 16:31:43.232321 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91668
    I0829 16:31:43.232398 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:43.266436 14986 solver.cpp:330] Iteration 90, Testing net (#0)
    I0829 16:31:43.514633 14986 blocking_queue.cpp:49] Waiting for data
    I0829 16:31:43.638617 14986 solver.cpp:397]     Test net output #0: tea_soft_loss = 4.91836
    I0829 16:31:43.638684 14986 solver.cpp:397]     Test net output #1: teacher_accuracy = 0.0117
    I0829 16:31:43.685451 14986 solver.cpp:330] Iteration 100, Testing net (#0)
    

    Even though base_lr: 0 makes the poly policy's effective learning rate, base_lr * (1 - iter/max_iter)^power, exactly 0 for the whole run (the log confirms lr = 0), tea_soft_loss still drifts between evaluations, so something must still be getting updated. I'd like to know what I'm missing about Caffe's update process :(

    1 Answer  |  7 years ago
  • RichieZhu · answered 7 years ago

    Found the cause.

    Caffe's BatchNorm layer uses use_global_stats to switch behavior between the TRAIN and TEST phases. With use_global_stats: false (the TRAIN-phase default), the layer keeps overwriting its running mean/variance blobs during the forward pass, regardless of lr_mult, so the "frozen" layers still change.

    In my case, I should set use_global_stats: true.

    Also don't forget the Scale layer: with bias_term: true it has two learnable blobs, so it needs two param { lr_mult: 0 } entries (my original config had only one).

    The modified layers should be:

    layer {
      name: "res2_1a_1_bn"
      type: "BatchNorm"
      bottom: "conv1"
      top: "res2_1a_1_bn"
      batch_norm_param {
          use_global_stats: true
      }
    }
    layer {
      name: "res2_1a_1_scale"
      type: "Scale"
      bottom: "res2_1a_1_bn"
      top: "res2_1a_1_bn"
      param {
        lr_mult: 0
        decay_mult: 0
      }
      param {
        lr_mult: 0
        decay_mult: 0
      }
      scale_param {
        bias_term: true
      }
    }
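
    To double-check that the teacher branch really is frozen, you can snapshot its parameter blobs, run a few solver steps, and compare. Below is a minimal pycaffe sketch; the file names solver.prototxt and teacher.caffemodel are placeholders for your own files, not taken from the question:

    # Check which "frozen" blobs actually change after a few iterations.
    import numpy as np
    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver('solver.prototxt')       # placeholder path
    solver.net.copy_from('teacher.caffemodel')        # placeholder path

    # Snapshot every learnable blob of the layers we expect to be frozen.
    frozen = ['conv1', 'res2_1a_1_bn', 'res2_1a_1_scale', 'fc100']
    before = {name: [blob.data.copy() for blob in solver.net.params[name]]
              for name in frozen}

    solver.step(10)                                   # a few training steps

    for name in frozen:
        for i, blob in enumerate(solver.net.params[name]):
            if not np.allclose(before[name][i], blob.data):
                print('%s param[%d] changed!' % (name, i))

    Without use_global_stats: true, the three BatchNorm blobs (running mean, running variance, moving-average factor) will be reported as changed even though their lr_mult is 0: they are rewritten by the layer's forward pass, not by the solver's update step. Once use_global_stats: true is set, nothing in the frozen branch should change.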