%// How many times each timed benchmark body is repeated
num_runs = 5;
%// Prime tic/toc with many throwaway calls so that their first-use
%// overhead does not leak into the measurements taken later
for k = 1:50000
    tic();
    elapsed = toc();
end

%// Random square input matrix shared by the benchmarks that follow
A = rand(4940);


sublen = 26; %// subset length
nrows = size(A,1); %// number of rows in input matrix

%// Fail fast when the rows cannot be split into whole subsets: a
%// fractional nsubs would otherwise only surface later as an obscure
%// size/indexing error inside the timed code
if mod(nrows,sublen) ~= 0
    error('Number of rows (%d) must be a multiple of the subset length (%d).', nrows, sublen);
end
nsubs = nrows/sublen; %// number of subsets

%// Benchmark 1: linear-indexing approach. Computes, for every row of A,
%// the row sum minus the sum of the elements that fall inside the
%// sublen-by-sublen blocks along the main diagonal. The whole body is
%// repeated num_runs times under a single tic/toc pair.
tic
for iter = 1:num_runs
    %// Linear indices of the first sublen-by-sublen block: 1..sublen down
    %// the rows, plus a stride of nrows for each successive column
    idx1 = bsxfun(@plus,[1:sublen]',[0:sublen-1]*nrows);%//'# starting block indices
    %// Replicate the first block's indices for each of the nsubs blocks;
    %// each block is shifted sublen columns (nrows*sublen) and sublen rows
    idx2 = bsxfun(@plus,idx1(:),[0:nsubs-1]*(nrows*sublen+sublen));%// all block indices
    %// Gather every block as one page of a sublen-by-sublen-by-nsubs array
    %// and sum along dimension 2, giving one excluded total per row
    exclude_sum = sum(A(reshape(idx2,sublen,sublen,[])),2); %// block elements summed
    %// (these are subtracted from the full row sums)
    out = sum(A,2) - exclude_sum(:); %// desired output
end
%// Report the elapsed time and drop this benchmark's temporaries
toc, clear idx1 idx2 exclude_sum out

%// Benchmark 2: masking approach. Zeroes the sublen-by-sublen diagonal
%// blocks of A and then takes plain row sums. The whole body is repeated
%// num_runs times under a single tic/toc pair.
tic
for iter = 1:num_runs
    %// kron(eye(nsubs),ones(sublen)) is a block-diagonal 0/1 matrix with
    %// nsubs blocks of ones, each sublen-by-sublen; ==1 turns it into a
    %// logical mask selecting the entries to zero.
    %// NOTE: A is overwritten in place, so iterations after the first
    %// operate on an A whose diagonal blocks are already zero.
    A(kron(eye(nsubs),ones(sublen))==1)=0;
    out1 = sum(A,2); %// row sums, now excluding the zeroed blocks
end
%// Report the elapsed time and drop the (modified) input and the result
toc, clear A out1