Sample code - How to implement Q-learning
#include <iostream> // for ostream, cout
#include <cmath> // for exp()
using namespace std;

const float GAMMA = 0.6;
const float maxQTEMPERATURE = 1.0 / 2;
const float minQTEMPERATURE = 1.0 / 50;
// even the minimum temperature retains a *somewhat* stochastic policy
// each (x,a) has its own varying ALPHAQ:
float alphaQ ( long int i )
{
return ( 1.0 / i );
}
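For example, alphaQ(1) = 1, alphaQ(2) = 0.5, alphaQ(10) = 0.1: the first visit to (x,a) overwrites the initial Q-value completely, and each later visit adjusts it less and less, so Q(x,a) ends up as the running average of the samples seen so far.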
An Agent is a "behaviour-producing module":
class Agent
{
public:
// keep track of Q-values for my actions:
StateActionSpace Q;
// what really defines me is my reward function
// (defined in subclasses):
virtual float reward ( state x, state y ) { return 0; }
// noQ(x,a) counts the no. of times we have "visited (x,a)"
// (tried out action a in state x)
// so e.g. we can have declining alphaQ:
StateActionSpace noQ;
// the action I suggest to execute:
action ai;
// temporary variables:
action af;
float sigma; // sum of the exp(Q/T) terms, filled in by calculateSigma()
void alloc()
{
Q.allocvector ( cf, df );
noQ.allocvector ( cf, df );
ai.allocvector ( df );
af.allocvector ( df );
}
int j;
#define foractions_j for ( j=0; j<=(ai.no-1); j++ )
int randomAction() { return randomAtoB ( 0, ai.no-1 ); }
};
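The types state, action and StateActionSpace, the dimensions cf (state) and df (action), and the random helpers are defined elsewhere in the full program. Purely as a sketch of the interfaces the code above assumes (these declarations are guesses for illustration, not the original definitions):
// sketch only - not the original definitions:
class state
{
public:
void allocvector ( int dim ); // allocate a vector with "dim" parts
};
class action
{
public:
int no; // no. of possible actions
void allocvector ( int dim );
void from ( int j ); // become the action with index j
};
class StateActionSpace
{
public:
void allocvector ( int cdim, int ddim );
float& at ( state x, action a ); // the entry for (x,a), writable
float max ( state x ); // max over all actions a of the entry for (x,a)
long int totalNumberOfExperiences(); // sum of all entries (for noQ)
};
class Agent;
class AgentArray
{
public:
int n; // no. of agents, indexed 1..n
Agent* operator[] ( int i );
};
int randomAtoB ( int a, int b ); // uniform random integer in [a,b]
float random0to1exclusive(); // uniform random float in (0,1)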
The Learning algorithm - how we update the Q-values:
void Agent :: updateQ ( state x, action a, state y )
{
float r = reward(x,y); // defined in subclasses
float total = r + (GAMMA * Q.max(y));
noQ.at(x,a)++;
float ALPHA = alphaQ ( noQ.at(x,a) );
Q.at(x,a) = ((1-ALPHA) * Q.at(x,a)) + (ALPHA * total);
}
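This is the standard one-step Q-learning update, Q(x,a) := (1-ALPHA)*Q(x,a) + ALPHA*( r + GAMMA*max over b of Q(y,b) ). For example, on the first visit to (x,a) we have ALPHA = 1, so with r = 1 and Q.max(y) = 0.5 the new value is 1 + 0.6*0.5 = 1.3, regardless of what Q(x,a) held before.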
How to use a Boltzmann "soft max" control policy
with variable temperature:
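Under this policy, action a is tried in state x with probability P(a|x) = exp(Q(x,a)/T) / sum over b of exp(Q(x,b)/T). A high temperature T flattens the distribution towards uniform random (pure exploration); as T falls towards 0 the policy approaches greedily taking the best-looking action (pure exploitation).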
// first, the sum of the exp(Q/T) terms:
void Agent :: calculateSigma ( state x, float QTEMPERATURE )
{
sigma = 0;
foractions_j
{
af.from(j);
sigma = sigma + exp ( Q.at(x,af)/QTEMPERATURE );
}
}
// can show how probable each action is to be tried:
void Agent :: printProb ( ostream& stream, state x, float QTEMPERATURE )
{
calculateSigma ( x, QTEMPERATURE ); // make sure sigma is up to date
foractions_j
{
af.from(j);
double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;
stream << "P(action " << j << ") = " << prob << "\n";
}
}
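A call like myagent.printProb ( cout, x, minQTEMPERATURE ) (hypothetical usage) would show how close to deterministic the policy has become at the minimum temperature.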
// suggest an action ai:
void Agent :: suggestBoltz ( state x, float QTEMPERATURE )
{
calculateSigma ( x, QTEMPERATURE );
float p = random0to1exclusive();
float sum = 0;
j = 0;
// accumulate probability mass action by action until we pass p
// (the bound on j guards against rounding error - see below):
while ( ( sum < p ) && ( j <= ai.no-1 ) )
{
af.from(j);
double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;
sum = sum + prob;
j++;
}
// just hit p
ai = af;
}
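This is roulette-wheel sampling: the probs partition (0,1) into one interval per action, and p falls into exactly one of them, so each action is picked with exactly its Boltzmann probability. The bound on j covers floating-point rounding, where the probs can sum to slightly under 1 and a p drawn in that last sliver would otherwise run j past the final action.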
// suggest action with reasonable (declining) temperature:
void Agent :: suggestReasonable ( state x )
{
suggestBoltz ( x, reasonableTemperature() );
}
float Agent :: reasonableTemperature()
{
// ceiling (defined elsewhere) = no. of experiences to anneal over:
long int total = noQ.totalNumberOfExperiences();
if ( total >= ceiling )
return minQTEMPERATURE;
else
{
float e = ((float) total) / ceiling; // cast, so this is not integer division
return ( minQTEMPERATURE + (1-e)*(maxQTEMPERATURE-minQTEMPERATURE) );
}
}
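With, say, ceiling = 100000 (an assumed value), the temperature falls linearly from maxQTEMPERATURE = 0.5 at the first experience to minQTEMPERATURE = 0.02 at the 100000th, and stays there from then on.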
// (almost) no exploration - demo mode:
void Agent :: exploit ( state x )
{
suggestBoltz ( x, minQTEMPERATURE );
}
A Creature may contain one or more Agents inside its head:
class Creature
{
protected:
state s; // temporary variables
state x;
state y;
action ak; // each Agent suggests an action ai
// one action ak wins and is executed
virtual void observe() {} // observe() fills up state s
virtual void execute ( action a ) {}
AgentArray A; // a list of agents 1..n
#define foragents_i for ( int i=1; i<=A.n; i++ )
int randomAgent() { return randomAtoB ( 1, A.n ); }
public:
Creature()
{
s.allocvector ( cf );
x.allocvector ( cf );
y.allocvector ( cf );
ak.allocvector ( df );
}
};
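To get a working creature we subclass Creature (and Agent) and fill in the virtual functions. A hypothetical minimal example - the grid-world names here (GridAgent, atFood and so on) are invented for illustration:
// hypothetical subclasses, for illustration only:
class GridAgent : public Agent
{
public:
// reward arriving in state y, having acted in state x:
float reward ( state x, state y )
{
if ( atFood(y) ) return 1.0; // atFood() would be ours to define
else return 0;
}
};
class GridCreature : public Creature
{
protected:
void observe() { /* fill up state s from the senses */ }
void execute ( action a ) { /* send action a to the motors */ }
};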
The interact() function:
// interact with the world just once:
void Creature :: interact ( int mode )
{
// observe state, each agent suggests an action:
observe(); x = s;
if (mode == _learnQ)
{
// explore - pick an action at random
// (via agent 1; all agents share the same action set):
ak.from ( A[1]->randomAction() );
}
else if (mode == _exploit)
{
foragents_i
A[i]->exploit(x);
}
else
{
foragents_i
A[i]->suggestReasonable(x);
}
// somehow go through the ai's
// and pick an agent and execute its action:
execute(ak);
// observe new state, all agents learn:
observe(); y = s;
if (mode == _learnQ)
foragents_i
A[i]->updateQ ( x, ak, y );
}
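Putting it together, a run might look like the following. The mode names _learnQ and _exploit come from interact() above, but their values, the GridCreature sketch and the step counts are assumptions for illustration:
// hypothetical main loop, for illustration only:
const int _learnQ = 0;
const int _exploit = 1;
int main()
{
GridCreature c;
for ( long int step = 1; step <= 100000; step++ )
c.interact ( _learnQ ); // learn the Q-values from random actions
for ( long int step = 1; step <= 100; step++ )
c.interact ( _exploit ); // demo the learnt behaviour
return 0;
}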