[net]
# Global input and training settings for the detection network.
# Number of images per batch; the loss is averaged over one batch.
batch=1
# Number of sub-batches each batch is divided into; the images of one sub-batch are handled in parallel.
subdivisions=1
# Input size of the network (height x width x channels).
height=448
width=448
channels=3
# Learning parameters.
momentum=0.9
decay=0.0005
# NOTE(review): b_debug looks like a debug switch (0 presumably = off) -- confirm against the parser.
b_debug=0

# Base learning rate.
learning_rate=0.001
# Learning-rate schedule: change the learning rate after the corresponding steps.
policy=steps
# Steps at which the learning rate changes; there must be as many steps as scales.
steps=200,400,600,20000,30000
# Re-scale the current learning rate by the corresponding factor once the number of steps is reached.
scales=2.5,2,2,.1,.1
# Maximum number of "iterations".
max_batches = 40000
# Snapshot the learned weights after every k "iterations".
i_snapshot_iteration=1000
# NOTE(review): presumably the file ending of the ground-truth annotation files -- confirm against the data loader.
c_ending_gt_files=.txt

#######
# the following part is identical to the first layers of the classification network specified in extraction.cfg
# thereby, we can use the learned filters from that network as initialization for training the detection network specified here
#
# Stage 1: one 7x7 convolution (64 filters, stride 2), then 2x2 max pooling with stride 2.
[convolutional]
filters=64
size=7
stride=2
pad=1
activation=leaky

[maxpool]
size=2
stride=2

# Stage 2: one 3x3 convolution (192 filters, stride 1), then 2x2 max pooling with stride 2.
[convolutional]
filters=192
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

# Stage 3: alternating 1x1 and 3x3 convolutions (128, 256, 256, 512 filters, all stride 1),
# then 2x2 max pooling with stride 2.
# NOTE(review): pad=1 is also set on the 1x1 kernels; presumably the parser treats pad as a
# flag ("pad by size/2") rather than a literal 1-pixel border -- confirm against the parser.
[convolutional]
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

# Stage 4: four repetitions of (1x1 convolution with 256 filters, 3x3 convolution with 512 filters),
# then a 1x1 convolution with 512 filters and a 3x3 convolution with 1024 filters (all stride 1),
# then 2x2 max pooling with stride 2.

# Repetition 1 of 4: 1x1x256 followed by 3x3x512.
[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

# Repetition 2 of 4.
[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

# Repetition 3 of 4.
[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

# Repetition 4 of 4.
[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

# Wider pair closing the stage: 1x1x512 followed by 3x3x1024.
[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

# Stage 5: two repetitions of (1x1 convolution with 512 filters, 3x3 convolution with 1024 filters),
# all stride 1; no pooling follows this stage.
[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

# here ends the part copied from extraction.cfg (in that network, only one avgpool layer, one fully connected layer, and a softmax are added afterwards)
#
#######
#
# here starts the new part specifically added for our detection network
# 4 conv layers to learn "detection-specific" high-level patterns
# 2 fully connected layers
# 1 detection layer (basically also fully connected, but its outputs have a specific interpretation)

# Detection-specific convolutions: four 3x3 layers with 1024 filters each.
# Only the second layer uses stride 2; the other three use stride 1.
[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

# The only stride-2 convolution in this group.
[convolutional]
size=3
stride=2
pad=1
filters=1024
activation=leaky

[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

[connected]
# First fully connected layer: 4096 outputs with leaky activation.
output=4096
activation=leaky

[connected]
# Second fully connected layer: linear activation; its outputs feed the [detection] layer.
# output needs to be side*side*((1 + l.coords)*l.n + l.classes), using the values set in the [detection] section
# l.coords -- 4 coordinates to specify a bounding box (key: coords)
# l.n -- how many bounding boxes are predicted from each cell (key: num)
# l.classes -- number of object classes (key: classes)
# e.g., 20 classes, 2 boxes, side 7 -> 7*7*((1+4)*2+20) = 1470
# e.g., 20 classes, 2 boxes, side 9 -> 9*9*((1+4)*2+20) = 2430
output= 1470
activation=linear

[detection]
# Detection layer: gives the outputs of the preceding connected layer their specific interpretation.
# classes, coords, num and side must agree with that layer's output size:
# side*side*((1 + coords)*num + classes) = 7*7*((1+4)*2+20) = 1470.
# number of object classes, 20 for pascal voc (l.classes)
classes=20
# each bounding box is described by 4 coordinates (l.coords)
coords=4
rescore=1
# NOTE(review): debug switch is enabled here but set to 0 in [net] -- confirm this is intended.
b_debug=1
# number of cells in x and y direction
# NOTE(review): the stride-2 layers in this file (two convolutions, four max-pools) reduce the
# 448 input by a factor of 64, i.e. to 7, assuming padding keeps sizes otherwise unchanged -- confirm.
side=7
# number of predicted boxes per cell (l.n)
num=2
softmax=0
# adapt loss function on bounding box width and height values
sqrt=1
jitter=.2

# relative weights for the combined loss function
object_scale=1
noobject_scale=.5
class_scale=1
coord_scale=5