cyber-security-resources/ai_research/ML_Fundamentals/ai_generated/data/Trust_Region_Policy_Optimization_(TRPO).py

Sure! Below is an example Python script that demonstrates the Trust Region Policy Optimization (TRPO) algorithm using OpenAI Gym's "CartPole-v1" environment:
```python
import gym
import numpy as np
# TF1-style graph code; the compat.v1 shim lets it run on TensorFlow 2.x as well.
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()


class PolicyNetwork:
    def __init__(self, state_dim, action_dim, hidden_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.states = tf.placeholder(tf.float32, [None, state_dim], name="states")
        self.actions = tf.placeholder(tf.int32, [None], name="actions")
        self.advantages = tf.placeholder(tf.float32, [None], name="advantages")

        # Two copies of the same architecture: "mean" holds the old (fixed) policy used
        # as the trust-region reference, "sample" is the policy being updated.
        self.mean_probs = self.build_network(scope="mean")
        self.sample_probs = self.build_network(scope="sample")

        self.mean_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="mean")
        self.sample_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="sample")

        self.policy_loss = self.compute_policy_loss()
        self.kl_divergence = self.compute_kl_divergence()
        self.gradient = self.compute_gradient()

    def build_network(self, scope):
        # Two-layer MLP that outputs action probabilities for the given variable scope.
        with tf.variable_scope(scope):
            hidden_layer = tf.layers.dense(self.states, self.hidden_dim, activation=tf.nn.relu)
            output_layer = tf.layers.dense(hidden_layer, self.action_dim)
            return tf.nn.softmax(output_layer)

    def act_probs(self, states):
        # Evaluate the current ("sample") policy on a batch of states.
        sess = tf.get_default_session()
        return sess.run(self.sample_probs, feed_dict={self.states: states})

    def compute_policy_loss(self):
        # Probability of the taken action under the new and the old policy.
        indices = tf.range(tf.shape(self.sample_probs)[0]) * self.action_dim + self.actions
        sampled_action_probs = tf.gather(tf.reshape(self.sample_probs, [-1]), indices)
        mean_action_probs = tf.gather(tf.reshape(tf.stop_gradient(self.mean_probs), [-1]), indices)
        # TRPO surrogate objective, negated so it can be minimized.
        ratio = sampled_action_probs / (mean_action_probs + 1e-8)
        return -tf.reduce_mean(ratio * self.advantages)

    def compute_kl_divergence(self):
        # KL(old || new); the old ("mean") policy is held fixed with stop_gradient.
        mean_probs = tf.stop_gradient(self.mean_probs)
        ratio = (mean_probs + 1e-8) / (self.sample_probs + 1e-8)
        return tf.reduce_mean(tf.reduce_sum(mean_probs * tf.log(ratio), axis=1))

    def compute_gradient(self):
        # Flat gradient of the surrogate loss w.r.t. the "sample" network weights.
        grads = tf.gradients(self.policy_loss, self.sample_weights)
        return tf.concat([tf.reshape(grad, [-1]) for grad in grads], axis=0)


def compute_advantages(rewards, next_value, discount_factor=0.99, gae_lambda=0.95):
    # Generalized Advantage Estimation; rewards double as value estimates because
    # this simplified script has no separate critic network.
    values = np.append(rewards, next_value)
    deltas = np.array(rewards) + discount_factor * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + discount_factor * gae_lambda * gae
        advantages[t] = gae
    return advantages


def run_episode(env, policy_network, max_episode_length=1000, render=False):
    # Roll out one episode with the current policy (classic Gym API, gym < 0.26).
    states, actions, rewards = [], [], []
    state = env.reset()
    for _ in range(max_episode_length):
        if render:
            env.render()
        probs = policy_network.act_probs(np.expand_dims(state, axis=0))[0].astype(np.float64)
        probs /= probs.sum()  # guard against float32 rounding before sampling
        action = np.random.choice(len(probs), p=probs)
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state
        if done:
            break
    return states, actions, rewards


def train(env, policy_network, max_iterations=1000, max_episode_length=1000, cg_iterations=10, delta=0.01):
    sample_weights = policy_network.sample_weights
    mean_weights = policy_network.mean_weights

    # Fisher-vector product built by differentiating the KL divergence twice.
    vector_placeholder = tf.placeholder(tf.float32, shape=[None], name="cg_vector")
    flat_kl_grad = flatten_gradients(tf.gradients(policy_network.kl_divergence, sample_weights))
    grad_vector_product = tf.reduce_sum(flat_kl_grad * vector_placeholder)
    fisher_vector_product_op = flatten_gradients(tf.gradients(grad_vector_product, sample_weights))

    # Op that adds a flat update vector to the "sample" weights.
    step_placeholder = tf.placeholder(tf.float32, shape=[None], name="flat_step")
    apply_step = [tf.assign_add(var, update)
                  for var, update in zip(sample_weights, unflatten_gradients(step_placeholder, sample_weights))]

    # Op that copies the updated policy into the old ("mean") policy.
    sync_old_policy = [tf.assign(old, new) for old, new in zip(mean_weights, sample_weights)]

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    sess.run(sync_old_policy)

    for iteration in range(max_iterations):
        episode_states, episode_actions, episode_rewards = run_episode(env, policy_network, max_episode_length)
        episode_advantages = compute_advantages(episode_rewards, 0)
        # Normalize advantages for numerical stability.
        episode_advantages = (episode_advantages - np.mean(episode_advantages)) / (np.std(episode_advantages) + 1e-8)

        feed_dict = {
            policy_network.states: np.array(episode_states),
            policy_network.actions: np.array(episode_actions, dtype=np.int32),
            policy_network.advantages: episode_advantages,
        }

        def hessian_vector_product(vector, damping=0.1):
            # Damped Fisher/Hessian-vector product evaluated on the current batch.
            feed = dict(feed_dict)
            feed[vector_placeholder] = vector
            return sess.run(fisher_vector_product_op, feed_dict=feed) + damping * vector

        # Natural-gradient policy update (this simplified script has no critic):
        # solve F x = -g with conjugate gradients, where g is the surrogate-loss gradient.
        policy_grad = sess.run(policy_network.gradient, feed_dict=feed_dict)
        step_direction = conjugate_gradients(hessian_vector_product, -policy_grad, cg_iterations)

        # Scale the step so the quadratic KL approximation stays within delta.
        shs = np.dot(step_direction, hessian_vector_product(step_direction))
        step_size = np.sqrt(2.0 * delta / (shs + 1e-8))

        feed_dict[step_placeholder] = step_size * step_direction
        sess.run(apply_step, feed_dict=feed_dict)
        sess.run(sync_old_policy)

        print("Iteration {}: episode return = {:.1f}".format(iteration, np.sum(episode_rewards)))

    sess.close()


def conjugate_gradients(hvp_fn, b, cg_iterations=10, residual_tol=1e-10):
    # Solve A x = b where A is only available through the matrix-vector product hvp_fn.
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    rsold = np.dot(r, r)
    for _ in range(cg_iterations):
        Ap = hvp_fn(p)
        alpha = rsold / (np.dot(p, Ap) + 1e-8)
        x += alpha * p
        r -= alpha * Ap
        rsnew = np.dot(r, r)
        if np.sqrt(rsnew) < residual_tol:
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x


def flatten_gradients(grads):
    # Concatenate a list of gradient tensors into one flat vector.
    return tf.concat([tf.reshape(grad, [-1]) for grad in grads], axis=0)


def unflatten_gradients(flat_vector, trainable_variables):
    # Split a flat vector back into tensors shaped like the given variables.
    grads = []
    start = 0
    for var in trainable_variables:
        var_shape = var.shape.as_list()
        var_size = int(np.prod(var_shape))
        grads.append(tf.reshape(flat_vector[start:start + var_size], var_shape))
        start += var_size
    return grads


def main():
    env = gym.make('CartPole-v1')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    hidden_dim = 32

    policy_network = PolicyNetwork(state_dim, action_dim, hidden_dim)
    train(env, policy_network, max_iterations=100)
    env.close()


if __name__ == "__main__":
    main()
```
In this script, the TRPO algorithm is used to optimize a policy network on the CartPole-v1 environment from the Gym library. The `PolicyNetwork` class defines two copies of the same policy architecture: a fixed "mean" (old) policy that serves as the trust-region reference and a "sample" policy that is updated. The `train` function implements the TRPO update: `run_episode` collects a trajectory, `compute_advantages` estimates advantages with GAE, `conjugate_gradients` (together with a `hessian_vector_product` closure built inside `train`) computes the natural-gradient direction, and `flatten_gradients`/`unflatten_gradients` convert between per-variable tensors and the flat vectors used by the conjugate-gradient solver.
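The size of that natural-gradient step comes from the quadratic approximation of the KL constraint: for a search direction `s`, scaling it by `sqrt(2 * delta / (s^T H s))` places the update on the trust-region boundary. A minimal NumPy sketch (the matrix `H`, vector `g`, and `delta` below are made-up stand-ins for illustration, not values from the script) shows this:
```python
import numpy as np

# Quadratic model of the KL divergence around the current policy: KL(x) ~ 0.5 * x^T H x.
H = np.array([[2.0, 0.3],
              [0.3, 1.0]])        # stand-in Fisher matrix (positive definite)
g = np.array([1.0, -0.5])         # stand-in policy gradient
delta = 0.01                      # trust-region size

s = np.linalg.solve(H, g)         # search direction that conjugate gradients approximates
step = np.sqrt(2 * delta / s.dot(H @ s)) * s

# The rescaled step sits on the trust-region boundary: 0.5 * step^T H step == delta.
print(0.5 * step.dot(H @ step))   # prints ~0.01
```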
Note that this implementation assumes the TensorFlow and Gym libraries are installed (it uses the TF1-style graph API via the `tf.compat.v1` shim and the classic Gym `reset`/`step` interface); you may need to install additional dependencies.
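For reference, an environment matching those assumptions could be set up as follows; the package names are standard, but the version pins are a suggestion rather than something the original script specifies:
```bash
pip install "tensorflow<2.16" "gym<0.26" numpy
```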