[Caffe Source Code] Initialization Methods and Random Number Generation
To initialize anything, we first need a random number generator, so let's start with the class structure. In common.hpp, nested inside class Caffe, there is:
class RNG {
 public:
  RNG();                            // seeds from cluster_seedgen()
  explicit RNG(unsigned int seed);  // seeds with an explicit value
  explicit RNG(const RNG&);
  RNG& operator=(const RNG&);
  void* generator();                // type-erased pointer to the underlying caffe::rng_t
 private:
  class Generator;                  // pimpl class that hides the boost dependency
  shared_ptr<Generator> generator_;
};
common.cpp contains two implementations, selected by the CPU_ONLY build flag.
CPU-only version (inside #ifdef CPU_ONLY):
class Caffe::RNG::Generator {
 public:
  Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
  explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
  caffe::rng_t* rng() { return rng_.get(); }
 private:
  shared_ptr<caffe::rng_t> rng_;
};

Caffe::RNG::RNG() : generator_(new Generator()) { }

Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { }

Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
  generator_ = other.generator_;  // share ownership of the same Generator
  return *this;
}

void* Caffe::RNG::generator() {
  return static_cast<void*>(generator_->rng());
}
GPU version (the #else branch, i.e. the normal GPU + CPU build):
class Caffe::RNG::Generator {
 public:
  Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
  explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
  caffe::rng_t* rng() { return rng_.get(); }
 private:
  shared_ptr<caffe::rng_t> rng_;
};

Caffe::RNG::RNG() : generator_(new Generator()) { }

Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { }

Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
  // Unlike the CPU_ONLY branch, this resets onto the raw pointer held by
  // `other`, so two separate shared_ptr groups end up owning the same Generator.
  generator_.reset(other.generator_.get());
  return *this;
}

void* Caffe::RNG::generator() {
  return static_cast<void*>(generator_->rng());
}
The concrete random number machinery lives in util/rng.hpp, which pulls in two Boost headers:
boost/random/mersenne_twister.hpp
boost/random/uniform_int.hpp
The first provides the Mersenne Twister engine; caffe::rng_t is simply a typedef for boost::mt19937.
The name Mersenne Twister comes from the fact that its period length is chosen to be a Mersenne prime. The two common variants are MT19937 and MT19937-64.
How it works: the Mersenne Twister generates random numbers with a linear feedback shift register (LFSR). The feedback function of an LFSR is a simple XOR of selected register bits, called the tap sequence; a maximal-length n-bit LFSR cycles through 2^n − 1 states before repeating, and MT19937's period is the Mersenne prime 2^19937 − 1.
The second header provides a uniform integer distribution over a closed range.
rng.hpp also implements shuffling with the Fisher-Yates algorithm:
template <class RandomAccessIterator, class RandomGenerator>
inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
                    RandomGenerator* gen) {
  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
      difference_type;
  typedef typename boost::uniform_int<difference_type> dist_type;

  difference_type length = std::distance(begin, end);
  if (length <= 0) return;

  // Fisher-Yates: walk from the back, swapping element i with a uniformly
  // chosen element in [0, i].
  for (difference_type i = length - 1; i > 0; --i) {
    dist_type dist(0, i);
    std::iter_swap(begin + i, begin + dist(*gen));
  }
}
And an overload that shuffles a whole iterator range using the shared Caffe RNG:
template <class RandomAccessIterator>
inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) {
  shuffle(begin, end, caffe_rng());
}
## The initialization method templates are defined in filler.hpp
## Definition of the base class
/// @brief Fills a Blob with constant or randomly-generated data.
template <typename Dtype>
class Filler {
 public:
  explicit Filler(const FillerParameter& param) : filler_param_(param) {}
  virtual ~Filler() {}
  virtual void Fill(Blob<Dtype>* blob) = 0;
 protected:
  FillerParameter filler_param_;
};  // class Filler
## Constant initialization
/// @brief Fills a Blob with constant values @f$ x = 0 @f$.
template <typename Dtype>
class ConstantFiller : public Filler<Dtype> {
 public:
  explicit ConstantFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    Dtype* data = blob->mutable_cpu_data();
    const int count = blob->count();
    const Dtype value = this->filler_param_.value();
    CHECK(count);
    for (int i = 0; i < count; ++i) {
      data[i] = value;
    }
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
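A small usage sketch (the blob shape and value here are arbitrary): build a FillerParameter in code, construct the filler, and fill a blob.

#include "caffe/blob.hpp"
#include "caffe/filler.hpp"

void FillBias() {
  caffe::Blob<float> bias(1, 1, 1, 10);
  caffe::FillerParameter param;
  param.set_type("constant");
  param.set_value(0.5f);               // every element becomes 0.5
  caffe::ConstantFiller<float> filler(param);
  filler.Fill(&bias);
}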
## Uniform initialization
/// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
template <typename Dtype>
class UniformFiller : public Filler<Dtype> {
 public:
  explicit UniformFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    CHECK(blob->count());
    caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
        Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
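## Gaussian initialization
filler.hpp also defines a GaussianFiller, dispatched by the "gaussian" type in GetFiller below. The following is only a simplified sketch of its core behavior; the real implementation additionally supports the sparse option, which is omitted here.

#include "caffe/blob.hpp"
#include "caffe/filler.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// Simplified sketch, not the actual GaussianFiller: draw every element from
// N(mean, std^2) using the mean/std fields of FillerParameter.
template <typename Dtype>
class SimpleGaussianFiller : public Filler<Dtype> {
 public:
  explicit SimpleGaussianFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    CHECK(blob->count());
    caffe_rng_gaussian<Dtype>(blob->count(),
        Dtype(this->filler_param_.mean()),
        Dtype(this->filler_param_.std()),
        blob->mutable_cpu_data());
  }
};

}  // namespace caffe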
## Positive unit-ball initialization
/** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
 *         such that @f$ \forall i \sum_j x_{ij} = 1 @f$.
 */
template <typename Dtype>
class PositiveUnitballFiller : public Filler<Dtype> {
 public:
  explicit PositiveUnitballFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    Dtype* data = blob->mutable_cpu_data();
    DCHECK(blob->count());
    caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
    // We expect the filler to not be called very frequently, so we will
    // just use a simple implementation
    int dim = blob->count() / blob->num();
    CHECK(dim);
    for (int i = 0; i < blob->num(); ++i) {
      Dtype sum = 0;
      for (int j = 0; j < dim; ++j) {
        sum += data[i * dim + j];
      }
      for (int j = 0; j < dim; ++j) {
        data[i * dim + j] /= sum;
      }
    }
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
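To make the normalization concrete, here is a small check sketch (shapes arbitrary): after filling, every row of length dim = count / num sums to 1, up to floating-point error.

#include "caffe/blob.hpp"
#include "caffe/filler.hpp"

void CheckUnitball() {
  caffe::Blob<float> blob(2, 3, 4, 4);        // num = 2, dim = 3*4*4 = 48
  caffe::FillerParameter param;
  caffe::PositiveUnitballFiller<float> filler(param);
  filler.Fill(&blob);
  const float* data = blob.cpu_data();
  const int dim = blob.count() / blob.num();
  for (int i = 0; i < blob.num(); ++i) {
    float sum = 0;
    for (int j = 0; j < dim; ++j) {
      sum += data[i * dim + j];
    }
    // sum is ~1.0f for every sample i
  }
}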
## Xavier initialization
Xavier is a simple and very effective initialization method; it fills a Blob with values drawn from U(-a, a), where a depends on the fan-in and/or fan-out of the layer:
/**
 * @brief Fills a Blob with values @f$ x \sim U(-a, +a) @f$ where @f$ a @f$ is
 *        set inversely proportional to number of incoming nodes, outgoing
 *        nodes, or their average.
 *
 * A Filler based on the paper [Bengio and Glorot 2010]: Understanding
 * the difficulty of training deep feedforward neural networks.
 *
 * It fills the incoming matrix by randomly sampling uniform data from [-scale,
 * scale] where scale = sqrt(3 / n) where n is the fan_in, fan_out, or their
 * average, depending on the variance_norm option. You should make sure the
 * input blob has shape (num, a, b, c) where a * b * c = fan_in and num * b * c
 * = fan_out. Note that this is currently not the case for inner product layers.
 *
 * TODO(dox): make notation in above comment consistent with rest & use LaTeX.
 */
template <typename Dtype>
class XavierFiller : public Filler<Dtype> {
 public:
  explicit XavierFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    CHECK(blob->count());
    int fan_in = blob->count() / blob->num();
    int fan_out = blob->count() / blob->channels();
    Dtype n = fan_in;  // default to fan_in
    if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_AVERAGE) {
      n = (fan_in + fan_out) / Dtype(2);
    } else if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_FAN_OUT) {
      n = fan_out;
    }
    Dtype scale = sqrt(Dtype(3) / n);
    caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
        blob->mutable_cpu_data());
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
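A worked example with hypothetical shapes: for a convolution weight blob of shape (64, 3, 5, 5) under the default fan_in normalization, the sampling bound works out to 0.2.

#include <cmath>
#include <cstdio>

int main() {
  const int num = 64, channels = 3, height = 5, width = 5;
  const int count = num * channels * height * width;
  const int fan_in = count / num;                // 3 * 5 * 5 = 75
  const float scale = std::sqrt(3.0f / fan_in);  // sqrt(3 / 75) = 0.2
  std::printf("weights ~ U(-%g, %g)\n", scale, scale);
  return 0;
}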
## MSRA initialization
MSRA (He) initialization specifically accounts for ReLU nonlinearities: weights are drawn from a Gaussian with std = sqrt(2 / n), where n is fan_in, fan_out, or their average, depending on the variance_norm option.
/**
 * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
 *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
 *        nodes, outgoing nodes, or their average.
 *
 * A Filler based on the paper [He, Zhang, Ren and Sun 2015]: Specifically
 * accounts for ReLU nonlinearities.
 *
 * Aside: for another perspective on the scaling factor, see the derivation of
 * [Saxe, McClelland, and Ganguli 2013 (v3)].
 *
 * It fills the incoming matrix by randomly sampling Gaussian data with std =
 * sqrt(2 / n) where n is the fan_in, fan_out, or their average, depending on
 * the variance_norm option. You should make sure the input blob has shape (num,
 * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this
 * is currently not the case for inner product layers.
 */
template <typename Dtype>
class MSRAFiller : public Filler<Dtype> {
 public:
  explicit MSRAFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    CHECK(blob->count());
    int fan_in = blob->count() / blob->num();
    int fan_out = blob->count() / blob->channels();
    Dtype n = fan_in;  // default to fan_in
    if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_AVERAGE) {
      n = (fan_in + fan_out) / Dtype(2);
    } else if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_FAN_OUT) {
      n = fan_out;
    }
    Dtype std = sqrt(Dtype(2) / n);
    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
        blob->mutable_cpu_data());
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
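For comparison, the same hypothetical (64, 3, 5, 5) blob under MSRA initialization with fan_in normalization gets std = sqrt(2 / 75) ≈ 0.163; the factor 2 (rather than 3 for a uniform bound) compensates for ReLU zeroing out roughly half of the activations.

// Same hypothetical shape as in the Xavier example above, MSRA rule.
#include <cmath>
#include <cstdio>

int main() {
  const int fan_in = 3 * 5 * 5;                 // 75
  const float std = std::sqrt(2.0f / fan_in);   // ~= 0.1633
  std::printf("weights ~ N(0, %g^2)\n", std);
  return 0;
}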
## Bilinear initialization
/*!
@brief Fills a Blob with coefficients for bilinear interpolation.
A common use case is with the DeconvolutionLayer acting as upsampling.
You can upsample a feature map with shape of (B, C, H, W) by any integer factor
using the following proto.
\code
layer {
name: "upsample", type: "Deconvolution"
bottom: "{{bottom_name}}" top: "{{top_name}}"
convolution_param {
kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
num_output: {{C}} group: {{C}}
pad: {{ceil((factor - 1) / 2.)}}
weight_filler: { type: "bilinear" } bias_term: false
}
param { lr_mult: 0 decay_mult: 0 }
}
\endcode
Please use this by replacing `{{}}` with your values. By specifying
`num_output: {{C}} group: {{C}}`, it behaves as
channel-wise convolution. The filter shape of this deconvolution layer will be
(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
interpolation kernel for every channel of the filter identically. The resulting
shape of the top feature map will be (B, C, factor * H, factor * W).
Note that the learning rate and the
weight decay are set to 0 in order to keep coefficient values of bilinear
interpolation unchanged during training. If you apply this to an image, this
operation is equivalent to the following call in Python with Scikit.Image.
\code{.py}
out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
\endcode
*/
template <typename Dtype>
class BilinearFiller : public Filler<Dtype> {
 public:
  explicit BilinearFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}
  virtual void Fill(Blob<Dtype>* blob) {
    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
    Dtype* data = blob->mutable_cpu_data();
    int f = ceil(blob->width() / 2.);
    Dtype c = (blob->width() - 1) / (2. * f);
    for (int i = 0; i < blob->count(); ++i) {
      Dtype x = i % blob->width();
      Dtype y = (i / blob->width()) % blob->height();
      data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
    }
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};
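A worked example for factor = 2 (values computed from the formula above, not from running Caffe): kernel_size = 2*factor - factor%2 = 4, f = ceil(4/2) = 2, c = (4-1)/(2*2) = 0.75, so the 1-D weights 1 - |x/f - c| for x = 0..3 are 0.25, 0.75, 0.75, 0.25, and each (K, K) filter is the outer product of that vector with itself.

#include <cmath>
#include <cstdio>

int main() {
  const int factor = 2;
  const int k = 2 * factor - factor % 2;           // kernel_size = 4
  const int f = static_cast<int>(std::ceil(k / 2.));
  const double c = (k - 1) / (2. * f);
  for (int x = 0; x < k; ++x) {
    std::printf("%g ", 1. - std::fabs(x / static_cast<double>(f) - c));
  }
  std::printf("\n");                               // prints: 0.25 0.75 0.75 0.25
  return 0;
}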
## Getting an instance for each filler type (the GetFiller factory)
template <typename Dtype>
Filler<Dtype>* GetFiller(const FillerParameter& param) {
  const std::string& type = param.type();
  if (type == "constant") {
    return new ConstantFiller<Dtype>(param);
  } else if (type == "gaussian") {
    return new GaussianFiller<Dtype>(param);
  } else if (type == "positive_unitball") {
    return new PositiveUnitballFiller<Dtype>(param);
  } else if (type == "uniform") {
    return new UniformFiller<Dtype>(param);
  } else if (type == "xavier") {
    return new XavierFiller<Dtype>(param);
  } else if (type == "msra") {
    return new MSRAFiller<Dtype>(param);
  } else if (type == "bilinear") {
    return new BilinearFiller<Dtype>(param);
  } else {
    CHECK(false) << "Unknown filler name: " << param.type();
  }
  return (Filler<Dtype>*)(NULL);
}
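A sketch of how layers typically use this factory during setup (the function and parameter names below are illustrative, not taken from a specific layer): wrap the raw pointer returned by GetFiller in a shared_ptr and call Fill on the weight blob.

#include <boost/shared_ptr.hpp>
#include "caffe/blob.hpp"
#include "caffe/filler.hpp"

template <typename Dtype>
void InitWeights(const caffe::FillerParameter& weight_filler_param,
                 caffe::Blob<Dtype>* weights) {
  boost::shared_ptr<caffe::Filler<Dtype> > filler(
      caffe::GetFiller<Dtype>(weight_filler_param));
  filler->Fill(weights);
}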