openai gym cartpole problem

by: tonyironstark, 7 years ago

Last edited: 7 years ago

hi,
i was working on the cartpole problem from the openai gym following your tutorial and i was converting your abstracted tflearn code to simple tensorflow code following all your tutorials.
but it seems that the weights are not converging at all. the model is similar's the the one in the tutorial. i have also applied regularization top of your code to avoid any overfitting but it seems it is always choosing one option in favor of another.
Please help me out.
[/
import numpy as np
import tensorflow as tf
import gym
import os
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
model_path = "C:/Users/sanka/codes/cart pole problem/tf_save3"
env = gym.make("CartPole-v0")
env.reset()


def train_set():
    try:
        tx = np.load("final_trainx.npy")
        ty = np.load("final_trainy.npy")
        return tx,ty
    except:
        tx = []
        ty = []
        for _ in range(10000):
            env.reset()
            score = 0
            moves = []
            obs = []
            p = []
            for _ in range(500):
                action = np.random.randint(0, 2)
                observation, reward, done, info = env.step(action)
                if (len(p)==0):
                    p = observation
                else:
                    moves += [action]
                    obs += [observation]
                    p = observation
                score += reward
                if done:
                    break
            if (score > 50):
                tx+=obs
                for i in range(len(moves)):
                    ac = moves[i]
                    if (ac == 1):
                        ty.append([0, 1])
                    else:
                        ty.append([1, 0])
        tx=np.array(tx)
        ty=np.array(ty)
        np.save("final_trainx.npy",tx)
        np.save("final_trainy.npy",ty)
        return tx, ty


weights = {
    1: tf.Variable(tf.truncated_normal([4, 128]), dtype=tf.float32),
    2: tf.Variable(tf.truncated_normal([128, 256]), dtype=tf.float32),
    3: tf.Variable(tf.truncated_normal([256, 512]), dtype=tf.float32),
    4: tf.Variable(tf.truncated_normal([512, 256]), dtype=tf.float32),
    5: tf.Variable(tf.truncated_normal([256, 128]), dtype=tf.float32),
    6: tf.Variable(tf.truncated_normal([128, 2]), dtype=tf.float32)
}

biases = {
    1: tf.Variable(tf.truncated_normal([128]), dtype=tf.float32),
    2: tf.Variable(tf.truncated_normal([256]), dtype=tf.float32),
    3: tf.Variable(tf.truncated_normal([512]), dtype=tf.float32),
    4: tf.Variable(tf.truncated_normal([256]), dtype=tf.float32),
    5: tf.Variable(tf.truncated_normal([128]), dtype=tf.float32),
    6: tf.Variable(tf.truncated_normal([2]), dtype=tf.float32)
}


def neural_network(x):
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[1]), biases[1]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[2]), biases[2]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[3]), biases[3]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[4]), biases[4]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[5]), biases[5]))
    x = tf.nn.dropout(x, 0.8)
    x = tf.add(tf.matmul(x, weights[6]), biases[6])
    return x


def test_nn(x):
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[1]), biases[1]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[2]), biases[2]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[3]), biases[3]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[4]), biases[4]))
    x = tf.nn.relu(tf.add(tf.matmul(x, weights[5]), biases[5]))
    x = tf.nn.softmax(tf.add(tf.matmul(x, weights[6]), biases[6]))
    return x


def train_nn():
    prediction = neural_network(x)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    lo=tf.nn.l2_loss(weights[1])+tf.nn.l2_loss(weights[2])+tf.nn.l2_loss(weights[3])+tf.nn.l2_loss(weights[4])+tf.nn.l2_loss(weights[5])+tf.nn.l2_loss(weights[6])
    loss=tf.reduce_mean(loss+0.01*lo)
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    test_pred = test_nn(x)
    correct = tf.equal(tf.argmax(test_pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        epoches = 5
        batch_size = 100
        for j in range(epoches):
            ep_loss=0
            for i in range(0,len(train_x),batch_size):
                epoch_x=train_x[i:min(i+batch_size,len(train_x))]
                epoch_y = train_y[i:min(i + batch_size, len(train_y))]
                _,c=sess.run([optimizer,loss],feed_dict={x:epoch_x,y:epoch_y})
                ep_loss+=c
                #print("Accuracy is {0}".format(sess.run(accuracy, feed_dict={x: epoch_x, y: epoch_y})))
            print("epoch {0} completed out of {1} with loss {2}".format(j,epoches,ep_loss))
            print("Accuracy is {0}".format(sess.run(accuracy,feed_dict={x:train_x,y:train_y})))

        scores = []
        choices = []
        for each_game in range(10):
            print("game ", each_game)
            score = 0
            game_memory = []
            prev_obs = []
            env.reset()
            for _ in range(500):
                env.render()
                if (len(prev_obs) == 0):
                    action = random.randrange(0, 2)
                else:
                    x1 = np.array([prev_obs]).reshape(-1,4)
                    a = tf.argmax(test_pred, 1)
                    action = sess.run(a, feed_dict={x: x1})
                    action=action[0]

                choices.append(action)
                new_observation, reward, done, info = env.step(action)
                prev_obs = new_observation
                game_memory.append([new_observation, action])
                score += reward
                if done:
                    break

            scores.append(score)

        print('Average Score:', sum(scores) / len(scores))
        print('choice 1:{}  choice 0:{}'.format(choices.count(1) / len(choices), choices.count(0) / len(choices)))



train_x,train_y=train_set()
print(train_x.shape)
print(train_y.shape)
x=tf.placeholder(tf.float32,[None,4])
y=tf.placeholder(tf.int32,[None,2])
train_nn()

]



You must be logged in to post. Please login or register an account.