Recently, I have been trying to reproduce your results on the VOT2016 benchmark. Using the provided result files, the computed EAO is 0.2905.
Below is the evaluation script I used. Is there anything I am missing?
function sfc_vot
% SFC_VOT  VOT-toolkit entry point for the SiamFC (fully-convolutional Siamese) tracker.
%
% Integrates SiamFC with the VOT evaluation protocol: reads the
% initialization region from the toolkit, computes exemplar features once
% on the first frame, then for every subsequent frame evaluates a 3-scale
% search pyramid and reports the estimated bounding box back through the
% VOT handle.
%
% NOTE(review): requires the SiamFC project functions on the MATLAB path
% (startup, env_paths_tracking, load_pretrained, remove_layers_from_prefix,
% get_subwindow_tracking, make_scale_pyramid, tracker_eval, vot) and a
% GPU-enabled MatConvNet build (p.gpus = 1) — confirm for your setup.

% *************************************************************
% VOT: Always call exit command at the end to terminate Matlab!
% *************************************************************
cleanup = onCleanup(@() exit());
% *************************************************************
% VOT: Set random seed to a different value every time.
% *************************************************************
RandStream.setGlobalStream(RandStream('mt19937ar', 'Seed', sum(clock)));
% *************************************************************
% SFC: Set tracking parameters
% *************************************************************
p.numScale = 3;            % number of scales searched per frame
p.scaleStep = 1.0375;      % geometric step between adjacent scales
p.scalePenalty = 0.9745;   % penalty applied to non-central scales
p.scaleLR = 0.59;          % damping factor for scale update
p.responseUp = 16;         % upsampling the small 17x17 response helps with the accuracy
p.windowing = 'cosine';    % to penalize large displacements
p.wInfluence = 0.176;      % windowing influence (in convex sum)
p.net = '2016-08-17.net.mat';
%% execution, visualization, benchmark
p.gpus = 1;
p.fout = -1;
%% Params from the network architecture, have to be consistent with the training
p.exemplarSize = 127;      % input z size
p.instanceSize = 255;      % input x size (search region)
p.scoreSize = 17;          % raw score map side before upsampling
p.totalStride = 8;
p.contextAmount = 0.5;     % context amount for the exemplar
p.subMean = false;
%% SiamFC prefix and ids
p.prefix_z = 'a_';         % used to identify the layers of the exemplar
p.prefix_x = 'b_';         % used to identify the layers of the instance
p.prefix_join = 'xcorr';
p.prefix_adj = 'adjust';
p.id_feat_z = 'a_feat';
p.id_score = 'score';
% -------------------------------------------------------------------------------------------------
startup;
% Get environment-specific default paths.
p = env_paths_tracking(p);
% Load ImageNet Video statistics (used for optional mean subtraction / crop
% normalization); tracking proceeds without them if the file is absent.
if exist(p.stats_path, 'file')
    stats = load(p.stats_path);
else
    warning('No stats found at %s', p.stats_path);
    stats = [];
end
% Load two copies of the pre-trained network
net_z = load_pretrained([p.net_base_path p.net], p.gpus);
net_x = load_pretrained([p.net_base_path p.net], []);
% Divide the net in 2:
% exemplar branch (used only once per video) computes features for the target
remove_layers_from_prefix(net_z, p.prefix_x);
remove_layers_from_prefix(net_z, p.prefix_join);
remove_layers_from_prefix(net_z, p.prefix_adj);
% instance branch computes features for search region x and cross-correlates with z features
remove_layers_from_prefix(net_x, p.prefix_z);
zFeatId = net_z.getVarIndex(p.id_feat_z);
scoreId = net_x.getVarIndex(p.id_score);
% **********************************
% VOT: Get initialization data
% **********************************
[handle, first_image, region] = vot('rectangle');
% If the provided region is a polygon, convert it to its axis-aligned
% bounding box [x, y, w, h]; otherwise just round the rectangle so that
% width/height stay consistent with the rounded corners.
if numel(region) > 4
    x1 = round(min(region(1:2:end)));
    x2 = round(max(region(1:2:end)));
    y1 = round(min(region(2:2:end)));
    y2 = round(max(region(2:2:end)));
    region = round([x1, y1, x2 - x1, y2 - y1]);
else
    region = round([round(region(1)), round(region(2)), ...
        round(region(1) + region(3)) - round(region(1)), ...
        round(region(2) + region(4)) - round(region(2))]);
end
% BUG FIX: the assignment was missing its semicolon, echoing the region to
% stdout on every run — unintended output that can interfere with the
% VOT/TraX protocol stream and pollutes logs.
irect = region;
% Internal state is [row, col] (y, x); the +1 accounts for MATLAB 1-indexing.
targetPosition = [irect(2) + (1 + irect(4)) / 2, irect(1) + (1 + irect(3)) / 2];
targetSize = [irect(4), irect(3)];
startFrame = 1;
% get the first frame of the video
im = gpuArray(single(imread(first_image)));
% if grayscale repeat one channel to match filters size
if size(im, 3) == 1
    im = repmat(im, [1 1 3]);
end
% per-channel average, used to pad crops that fall outside the image
avgChans = gather([mean(mean(im(:,:,1))) mean(mean(im(:,:,2))) mean(mean(im(:,:,3)))]);
% Exemplar region: target plus context, mapped to a square of side s_z.
wc_z = targetSize(2) + p.contextAmount*sum(targetSize);
hc_z = targetSize(1) + p.contextAmount*sum(targetSize);
s_z = sqrt(wc_z*hc_z);
scale_z = p.exemplarSize / s_z;
% initialize the exemplar
[z_crop, ~] = get_subwindow_tracking(im, targetPosition, [p.exemplarSize p.exemplarSize], [round(s_z) round(s_z)], avgChans);
if p.subMean
    z_crop = bsxfun(@minus, z_crop, reshape(stats.z.rgbMean, [1 1 3]));
end
% Search-region side in original image coordinates.
d_search = (p.instanceSize - p.exemplarSize)/2;
pad = d_search/scale_z;
s_x = s_z + 2*pad;
% arbitrary scale saturation
min_s_x = 0.2*s_x;
max_s_x = 5*s_x;
switch p.windowing
    case 'cosine'
        window = single(hann(p.scoreSize*p.responseUp) * hann(p.scoreSize*p.responseUp)');
    case 'uniform'
        window = single(ones(p.scoreSize*p.responseUp, p.scoreSize*p.responseUp));
end
% make the window sum 1
window = window / sum(window(:));
% Symmetric scale factors around 1, e.g. [1/step, 1, step] for numScale = 3.
scales = (p.scaleStep .^ ((ceil(p.numScale/2)-p.numScale) : floor(p.numScale/2)));
% evaluate the offline-trained network for exemplar z features (done once)
net_z.eval({'exemplar', z_crop});
z_features = net_z.vars(zFeatId).value;
z_features = repmat(z_features, [1 1 1 p.numScale]);
% start tracking
i = startFrame;
while true
    % **********************************
    % VOT: Get next frame
    % **********************************
    [handle, image] = handle.frame(handle);
    if isempty(image)
        break;
    end
    if i > startFrame
        % load new frame on GPU
        im = gpuArray(single(imread(image)));
        % if grayscale repeat one channel to match filters size
        if size(im, 3) == 1
            im = repmat(im, [1 1 3]);
        end
        scaledInstance = s_x .* scales;
        scaledTarget = [targetSize(1) .* scales; targetSize(2) .* scales];
        % extract scaled crops for search region x at previous target position
        x_crops = make_scale_pyramid(im, targetPosition, scaledInstance, p.instanceSize, avgChans, stats, p);
        % evaluate the offline-trained network for instance x features
        [newTargetPosition, newScale] = tracker_eval(net_x, round(s_x), scoreId, z_features, x_crops, targetPosition, window, p);
        targetPosition = gather(newTargetPosition);
        % scale damping and saturation
        s_x = max(min_s_x, min(max_s_x, (1-p.scaleLR)*s_x + p.scaleLR*scaledInstance(newScale)));
        targetSize = (1-p.scaleLR)*targetSize + p.scaleLR*[scaledTarget(1,newScale) scaledTarget(2,newScale)];
    else
        % at the first frame output position and size passed as input (ground truth)
    end
    i = i + 1;
    % Convert internal [y, x] center + [h, w] size to VOT [x, y, w, h].
    oTargetPosition = targetPosition; % .* frameSize ./ newFrameSize;
    oTargetSize = targetSize;         % .* frameSize ./ newFrameSize;
    region = [oTargetPosition([2,1]) - oTargetSize([2,1])/2, oTargetSize([2,1])];
    % **********************************
    % VOT: Report position for frame
    % **********************************
    handle = handle.report(handle, region);
end
% **********************************
% VOT: Output the results
% **********************************
handle.quit(handle);
end