% Question: How much data does it take to resolve slope of classification boundary to
% within 1% error? You can generate statistics for slope even though the
% model doesn't include this variable.
% Question: How does number of data points effect mixing rate and number of
% accepts/rejects?
close all;
n = 500; % number of data samples for model fitting: 50-5000
kappa = 5; % condition number of data sets
sigma = 5; % the standard deviation of the proposal distribution. How does this effect the acceptance rate?
iters = 10000; % number of MCMC samples to draw
% create classification problem dataset
[D,l] = create_classification_problem(n,2,kappa);
% log-likelihood for logistic regression
A = -diag(l)*D;
ll = @(x) -sum(log(1+exp(A*x)));
% starting point (may not be close to center of distribution)
x = [0,0]';
% perform MCMC
reject_count=0;
accept_count=0;
samps = zeros(iters,2);
slopes = zeros(iters,1);
for i = 1:iters
y = x+sigma*randn(2,1); % make a proposal: try decreasing sigma when there's a lot of data to boost acceptance rate
alpha = exp(ll(y)-ll(x));% accept the proposal with probability alpha
if rand()0,1),D(l>0,2),'r');
hold on;
scatter(D(l<0,1),D(l<0,2),'b');
hold off;
title('data');
% Plot samples
subplot(2,2,2)
scatter(samps(:,1),samps(:,2),'.')
title('samps');
xlabel('x coefficient');
ylabel('y coefficient');
% Plot histogram of x-coefficients
subplot(2,2,3)
hist(samps(:,1),100)
title('x coefficient');
% Plot histogram of y-coefficients
subplot(2,2,4)
hist(samps(:,2),100)
title('y coefficient');