/** * GQ(lambda): off-policy learning algorithm * @author Adam White */ public class GQLambda { double[] theta; //learning weights double[] w; //learning weights double[] e; //elegibility trace vector int n; public GQLambda(int n) { theta = new double[n]; w = new double[n]; e = new double[n]; this.n = n; } /** * Inputs:: * phi - feature vector corresponding to action a_t in state s_t * phi_next - expected next state feature vector corresponding to a \in A and s_t+1 * lambda - elegibility trace parameter [0,1] * gamma - discount factor [0,1] * z - outcome reward * r - transient reward * rho - ratio of target policy to behaviour policy [0,1] * I - set of interest for s_t, a_t [0,1] **/ public void GQLearn(double[] phi, double[] phi_next, double lambda, double gamma, double z, double r, double rho, double I) { double alpha = 0.0001, eta = 1.0; //step size parameter double delta; delta = r + (1-gamma)*z + gamma*dot(theta,phi_next) - dot(theta,phi); for(int i=0;i