
I am trying to convert the Java code below to Python, and this is what I have so far. The Java code works, but my Python version doesn't. Please help me.

Python Code

import random


class QLearning():
    alpha = 0.1
    gamma = 0.9

    state_a = 0
    state_b = 1
    state_c = 2
    state_d = 3
    state_e = 4
    state_f = 5

    states_count = 6

    states = [state_a, state_b, state_c, state_d, state_e, state_f]

    R = [[0 for x in range(states_count)] for x in range(states_count)]
    Q = [[0 for x in range(states_count)] for x in range(states_count)]

    action_from_a = [state_b, state_d]
    action_from_b = [state_a, state_c, state_e]
    action_from_c = [state_c]
    action_from_d = [state_a, state_e]
    action_from_e = [state_b, state_d, state_f]
    action_from_f = [state_c, state_e]

    actions = [action_from_a, action_from_b, action_from_c, action_from_d, action_from_e, action_from_f]

    state_names = ["A","B","C","D","E","F"]

    def __init__(self):
        self.R[self.state_b][self.state_c] = 100
        self.R[self.state_f][self.state_c] = 100

    def run(self):
        for i in range(1000):
            state = random.randrange(self.states_count)
            while(state != self.state_c):
                actions_from_state = self.actions[state]
                index = random.randrange(len(actions_from_state))
                action = actions_from_state[index]
                next_state = action
                q = self.Q_Value(state, action)
                max_Q = self.max_q(next_state)
                r = self.R_Value(state, action)

                value = q + self.alpha * (r + self.gamma * max_Q - q)
                self.set_q(state, action, value)
                state = next_state

    def max_q(self, s):
        self.run().actions_from_state = self.actions[s]
        max_value = 5
        for i in range(len(self.run().actions_from_state)):
            self.run().next_state = self.run().actions_from_state[i]
            self.run().value = self.Q[s][self.run().next_state]

            if self.run().value > max_value:
                max_value = self.run().value
        return max_value

    def policy(self, state):
        self.run().actions_from_state = self.actions[state]
        max_value = 5
        policy_goto_state = state
        for i in range(len(self.run().actions_from_state)):
            self.run().next_state = self.run().actions_from_state[i]
            self.run().value = self.Q[state][self.run().next_state]

            if self.run().value > max_value:
                max_value = self.run().value
                policy_goto_state = self.run().next_state
        return policy_goto_state

    def Q_Value(self, s,a):
        return self.Q[s][a]

    def set_q(self, s, a, value):
        self.Q[s][a] = value

    def R_Value(self, s, a):
        return self.R[s][a]

    def print_result(self):
        print("Print Result")
        for i in range(len(self.Q)):
            print("Out From (0)".format(self.state_names[i]))
            for j in range(len(self.Q[i])):
                print(self.Q[i][j])

    def show_policy(self):
        print("Show Policy")
        for i in range(len(self.states)):
            fro = self.states[i]
            to = self.policy(fro)
            print("From {0} goto {1}".format(self.state_names[fro], self.state_names[to]))

obj = QLearning()
obj.run()
obj.print_result()
obj.show_policy()

Java Code

import java.text.DecimalFormat;
import java.util.Random;

public class Qlearning {
    final DecimalFormat df = new DecimalFormat("#.##");

    // path finding
    final double alpha = 0.1;
    final double gamma = 0.9;


// states A,B,C,D,E,F
// e.g. from A we can go to B or D
// from C we can only go to C
// C is goal state, reward 100 when B->C or F->C
//
// _______
// |A|B|C|
// |_____|
// |D|E|F|
// |_____|
//

    final int stateA = 0;
    final int stateB = 1;
    final int stateC = 2;
    final int stateD = 3;
    final int stateE = 4;
    final int stateF = 5;

    final int statesCount = 6;
    final int[] states = new int[]{stateA,stateB,stateC,stateD,stateE,stateF};

    // http://en.wikipedia.org/wiki/Q-learning
    // http://people.revoledu.com/kardi/tutorial/ReinforcementLearning/Q-Learning.htm

    // Q(s,a)= Q(s,a) + alpha * (R(s,a) + gamma * Max(next state, all actions) - Q(s,a))

    int[][] R = new int[statesCount][statesCount]; // reward lookup
    double[][] Q = new double[statesCount][statesCount]; // Q learning

    int[] actionsFromA = new int[] { stateB, stateD };
    int[] actionsFromB = new int[] { stateA, stateC, stateE };
    int[] actionsFromC = new int[] { stateC };
    int[] actionsFromD = new int[] { stateA, stateE };
    int[] actionsFromE = new int[] { stateB, stateD, stateF };
    int[] actionsFromF = new int[] { stateC, stateE };
    int[][] actions = new int[][] { actionsFromA, actionsFromB, actionsFromC,
            actionsFromD, actionsFromE, actionsFromF };

    String[] stateNames = new String[] { "A", "B", "C", "D", "E", "F" };

    public Qlearning() {
        init();
    }

    public void init() {
        R[stateB][stateC] = 100; // from b to c
        R[stateF][stateC] = 100; // from f to c
    }

    public static void main(String[] args) {
        long BEGIN = System.currentTimeMillis();

        Qlearning obj = new Qlearning();

        obj.run();
        obj.printResult();
        obj.showPolicy();

        long END = System.currentTimeMillis();
        System.out.println("Time: " + (END - BEGIN) / 1000.0 + " sec.");
    }

    void run() {
        /*
         1. Set parameter, and environment reward matrix R
         2. Initialize matrix Q as zero matrix
         3. For each episode: select a random initial state.
            Do while the goal state has not been reached:
              o Select one among all possible actions for the current state
              o Using this possible action, consider going to the next state
              o Get the maximum Q value of this next state based on all possible actions
              o Compute the Q value; set the next state as the current state
         */

        // For each episode
        Random rand = new Random();
        for (int i = 0; i < 1000; i++) { // train episodes
            // Select random initial state
            int state = rand.nextInt(statesCount);
            while (state != stateC) // goal state
            {
                // Select one among all possible actions for the current state
                int[] actionsFromState = actions[state];

                // Selection strategy is random in this example
                int index = rand.nextInt(actionsFromState.length);
                int action = actionsFromState[index];

                // Action outcome is set to deterministic in this example
                // Transition probability is 1
                int nextState = action; // data structure

                // Using this possible action, consider to go to the next state
                double q = Q(state, action);
                double maxQ = maxQ(nextState);
                int r = R(state, action);

                double value = q + alpha * (r + gamma * maxQ - q);
                setQ(state, action, value);

                // Set the next state as the current state
                state = nextState;
            }
        }
    }

    double maxQ(int s) {
        int[] actionsFromState = actions[s];
        double maxValue = Double.MIN_VALUE;
        for (int i = 0; i < actionsFromState.length; i++) {
            int nextState = actionsFromState[i];
            double value = Q[s][nextState];

            if (value > maxValue)
                maxValue = value;
        }
        return maxValue;
    }

    // get policy from state
    int policy(int state) {
        int[] actionsFromState = actions[state];
        double maxValue = Double.MIN_VALUE;
        int policyGotoState = state; // default goto self if not found
        for (int i = 0; i < actionsFromState.length; i++) {
            int nextState = actionsFromState[i];
            double value = Q[state][nextState];

            if (value > maxValue) {
                maxValue = value;
                policyGotoState = nextState;
            }
        }
        return policyGotoState;
    }

    double Q(int s, int a) {
        return Q[s][a];
    }

    void setQ(int s, int a, double value) {
        Q[s][a] = value;
    }

    int R(int s, int a) {
        return R[s][a];
    }

    void printResult() {
        System.out.println("Print result");
        for (int i = 0; i < Q.length; i++) {
            System.out.print("out from " + stateNames[i] + ":  ");
            for (int j = 0; j < Q[i].length; j++) {
                System.out.print(df.format(Q[i][j]) + " ");
            }
            System.out.println();
        }
    }

    // policy is maxQ(states)
    void showPolicy() {
        System.out.println("\nshowPolicy");
        for (int i = 0; i < states.length; i++) {
            int from = states[i];
            int to =  policy(from);
            System.out.println("from "+stateNames[from]+" goto "+stateNames[to]);
        }
    }
}

Traceback

C:\Python33\python.exe "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py"
Traceback (most recent call last):
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 4, in <module>
    class QLearning():
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 19, in QLearning
    R = [[0 for x in range(states_count)] for x in range(states_count)]
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 19, in <listcomp>
    R = [[0 for x in range(states_count)] for x in range(states_count)]
NameError: global name 'states_count' is not defined
ajknzhol

1 Answer


To access all of the class attributes you define (i.e. everything between class QLearning and def __init__), you need to use self or the class name:

self.states_count

or

QLearning.states_count

I don't know the algorithm, but it is possible that these class attributes should be instance attributes (i.e. separate for each instance of the class, rather than shared amongst all instances) and therefore be defined in __init__ (or other instance methods) using self anyway, as in the sketch below.
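For illustration, here is a minimal sketch of that instance-attribute approach (reusing the names from the question's code), moving everything into __init__ so that states_count is in scope when R and Q are built:

class QLearning():
    def __init__(self):
        self.alpha = 0.1
        self.gamma = 0.9

        # States A..F mapped to indices 0..5
        self.state_a, self.state_b, self.state_c = 0, 1, 2
        self.state_d, self.state_e, self.state_f = 3, 4, 5
        self.states_count = 6
        self.states = list(range(self.states_count))

        # self is a local variable of __init__, so these comprehensions
        # can see it -- unlike a bare states_count in the class body
        self.R = [[0] * self.states_count for _ in range(self.states_count)]
        self.Q = [[0.0] * self.states_count for _ in range(self.states_count)]

        # Reward 100 for entering the goal state C from B or F
        self.R[self.state_b][self.state_c] = 100
        self.R[self.state_f][self.state_c] = 100

Note that the <listcomp> frame in the traceback points at the real culprit: in Python 3 a list comprehension has its own scope, and that scope skips the enclosing class body, so the inner comprehension cannot see states_count even though it is defined a few lines above. (QLearning.states_count would not help on that particular line either, because the class object does not yet exist while its body is executing.) Inside __init__ the problem disappears, since the comprehensions can close over the method's local self.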

jonrsharpe