
CartPole Reinforcement Learning

My own solution to the CartPole challenge. I only use the pole angle, nothing else, and keep a short history of it (the memory capacity). With a memory capacity of 1 it does not work; with 2 I need luck; 3 and more look good. I had already solved it with all 4 observation parameters, but that did not match my personal experience of balancing a pole on my finger, where the only thing I observe is the angle.
import gym
import numpy as np


class HillClimbingAgent():

    def __init__(self, env):
        self.action_size = env.action_space.n
        self.state = [0, 0, 0]  # memory capacity: the last 3 pole angles
        self.input_size = len(self.state)
        self.W = 1e-4 * np.random.rand(self.action_size, self.input_size)
        self.best_W = np.copy(self.W)
        self.best = -np.inf
        self.noise_scale = 1e-5

    def _append_to_state(self, state):
        # Shift the angle history one slot to the right and store the
        # newest pole angle (index 2 of the observation) at the front.
        n = len(self.state)
        for i in range(n - 1):
            self.state[n - i - 1] = self.state[n - i - 2]
        self.state[0] = state[2]

    def get_action(self, state):
        self._append_to_state(state)
        action = np.argmax(np.dot(self.W, self.state))
        return action

    def update(self, total_reward):
        if total_reward > self.best:
            # Improvement: keep these weights and shrink the noise,
            # with a small floor so the search never stalls completely.
            self.best = total_reward
            self.best_W = np.copy(self.W)
            self.noise_scale = max(self.noise_scale / 2, 1e-3)
        else:
            # No improvement: widen the search, capped at 2.
            self.noise_scale = min(self.noise_scale * 2, 2)
        # Perturb the best weights found so far.
        self.W = self.best_W + self.noise_scale * np.random.rand(self.action_size, self.input_size)


env_name = "CartPole-v1"
env = gym.make(env_name)
agent = HillClimbingAgent(env)

for i in range(50):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.get_action(state)
        state, reward, done, info = env.step(action)
        total_reward += reward
        env.render()

    agent.update(total_reward)
    print(f'{i} total_reward: {total_reward}')

    if total_reward == 500:  # CartPole-v1 caps an episode at 500 reward
        break
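To make the shift-register behaviour of the angle history explicit, here is a minimal standalone sketch of the same update that _append_to_state performs (the function name append_angle is my own, for illustration only): the newest angle enters at index 0 and older values move one slot toward the end.

    # Sketch of the angle-history shift: newest angle at index 0,
    # older angles pushed toward the end of the list.
    def append_angle(history, angle):
        n = len(history)
        for i in range(n - 1):
            history[n - i - 1] = history[n - i - 2]
        history[0] = angle
        return history

    h = [0.0, 0.0, 0.0]
    append_angle(h, 0.1)   # h is now [0.1, 0.0, 0.0]
    append_angle(h, 0.2)   # h is now [0.2, 0.1, 0.0]

With a memory capacity of 3, the dot product in get_action therefore sees the current angle plus the two previous ones, which is enough to implicitly estimate the angular velocity and acceleration.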
