-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathoffpolicy_eval_tdlearning_with_morta.m
57 lines (39 loc) · 1.49 KB
/
offpolicy_eval_tdlearning_with_morta.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
function [ bootql,prog ] = offpolicy_eval_tdlearning_with_morta( qldata3, physpol, ptid, idx, actionbloctrain, Y90, gamma, num_iter )
%OFFPOLICY_EVAL_TDLEARNING_WITH_MORTA  Bootstrapped off-policy value estimate.
%
%   On each of num_iter iterations a random subset of the IDs found in
%   qldata3(:,8) is drawn, OffpolicyQlearning150816 is run on the matching
%   rows (columns 1:4), and two things are recorded:
%     * sum_a physpol(s,a)*Qoff(s,a), averaged over the 750 states with
%       weights equal to how often each state occurs in column 2 among the
%       rows where column 1 equals 1;
%     * for every entry of ptid that belongs to the drawn subset, the value
%       Qoff(idx, actionbloctrain) together with Y90, ptid and the iteration.
%
%   Inputs
%     qldata3          numeric matrix with at least 8 columns. Column 1 == 1
%                      selects the rows used for the state distribution
%                      (presumably the first time step of each trajectory -
%                      TODO confirm), column 2 is the state, column 8 the ID
%                      used for resampling.
%     physpol          matrix of at least 750 x 25, multiplied element-wise
%                      with Qoff (presumably the clinicians' policy - confirm).
%     ptid             IDs, compared against the values of qldata3(:,8).
%     idx              row subscripts into Qoff, parallel to ptid.
%     actionbloctrain  column subscripts into Qoff, parallel to ptid.
%     Y90              outcome values, parallel to ptid.
%     gamma            forwarded unchanged to OffpolicyQlearning150816.
%     num_iter         number of resampling iterations.
%
%   Outputs
%     bootql           num_iter x 1 vector, one weighted value per iteration.
%     prog             N x 4 matrix: [Q value, Y90, ptid, iteration number].

N_STATES  = 750;   % hard-coded table size (rows of physpol / Qoff that are used)
N_ACTIONS = 25;    % hard-coded table size (columns of physpol / Qoff that are used)

% Each ID is kept with a probability chosen so that about 5000 IDs are drawn,
% capped at 0.75 (i.e. at most ~75% of the IDs are used per iteration).
sampleIds = unique(qldata3(:,8));
keepProb  = min([5000/numel(sampleIds) 0.75]);

% State distribution: occurrences of each state 1..750 among rows with col 1 == 1.
firstRows   = qldata3(:,1)==1;
startStates = qldata3(firstRows,2);
d = arrayfun(@(s) sum(startStates==s), (1:N_STATES).');

% prog is preallocated slightly above its expected size and trimmed at the end.
prog    = NaN(floor(size(ptid,1)*1.01*keepProb*num_iter),4);
nextRow = 1;
bootql  = cell(num_iter,1);

% Text progress bar: one dot per iteration, overwritten as iterations finish.
fprintf('Progress:\n');
fprintf(['\n', repmat('.',1,num_iter), '\n\n']);

for iter = 1:num_iter
    fprintf('\b|\n');

    % Bernoulli(keepProb) draw per ID: floor(u + keepProb) is 1 exactly when kept.
    drawn     = floor(rand(size(sampleIds,1),1)+keepProb) == 1;
    chosenIds = sampleIds(drawn);

    % Learn Q on the rows of the drawn IDs. The constants 0.1 and 300000 are
    % passed through as-is (presumably learning rate and trace count - confirm
    % against OffpolicyQlearning150816).
    inSample  = ismember(qldata3(:,8),chosenIds);
    [Qoff, ~] = OffpolicyQlearning150816( qldata3(inSample,1:4) , gamma, 0.1, 300000);

    % Per-state value under physpol. Writing into the preallocated matrix keeps
    % V a full double array regardless of the operand classes.
    V = zeros(N_STATES,N_ACTIONS);
    V(:,:) = physpol(1:N_STATES,1:N_ACTIONS) .* Qoff(1:N_STATES,1:N_ACTIONS);
    Vs = sum(V.',1).';

    % Average over the state distribution, ignoring NaN contributions.
    bootql{iter} = nansum(Vs.*d)/sum(d);

    % Record Q value and outcome for every ptid entry in the drawn subset (for plots).
    hits  = find(ismember(ptid,chosenIds));
    nHits = numel(hits);
    if nHits > 0
        hits = hits(:);
        rows = (nextRow:nextRow+nHits-1).';

        stateSub  = idx(hits);             stateSub  = double(stateSub(:));
        actionSub = actionbloctrain(hits); actionSub = double(actionSub(:));
        qLinear   = sub2ind(size(Qoff),stateSub,actionSub);

        outcome = Y90(hits);
        ids     = ptid(hits);

        % Column-by-column assignment so every value is converted to prog's class.
        prog(rows,1) = Qoff(qLinear);
        prog(rows,2) = outcome(:);
        % The raw ID is stored; the same ID can recur in several iterations, so
        % column 4 is what tells the draws apart. (Scaling the ID by the
        % iteration number was ruled out by the original author because it
        % collides with existing IDs, e.g. ID 3 x iteration 10 = 30.)
        prog(rows,3) = ids(:);
        prog(rows,4) = iter;

        nextRow = nextRow + nHits;
    end
end

bootql = cell2mat(bootql);
prog(nextRow:end,:) = [];   % drop the unused preallocated rows
end