# Initial installation (if needed)
!pip install gymnasium torch matplotlib numpy -q
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
# 1. Environment (CartPole - simple, no extra dependencies)
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]  # 4 state variables
action_dim = env.action_space.n  # 2 discrete actions
print("Environment: CartPole-v1")
print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")
# 2. Actor-Critic network
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        # Shared trunk: both heads read the same features
        self.common = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.ReLU()
        )
        # Actor head: action logits (the policy)
        self.actor = nn.Sequential(
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Linear(hidden // 2, action_dim)
        )
        # Critic head: scalar state-value estimate V(s)
        self.critic = nn.Sequential(
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Linear(hidden // 2, 1)
        )

    def forward(self, x):
        common_out = self.common(x)
        logits = self.actor(common_out)
        value = self.critic(common_out)
        return logits, value
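
# Optional sanity check (a quick sketch, not part of training): a forward
# pass on a dummy batch should give logits of shape (1, action_dim) and a
# value of shape (1, 1).
_net = ActorCritic(state_dim, action_dim)
_logits, _value = _net(torch.zeros(1, state_dim))
print(f"logits shape: {tuple(_logits.shape)}, value shape: {tuple(_value.shape)}")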
# 3. Policy Gradient agent (Actor-Critic)
class PolicyGradientAgent:
    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99, entropy_coef=0.01):
        self.network = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)
        self.gamma = gamma
        self.entropy_coef = entropy_coef
        # Per-episode buffers, cleared after each call to learn()
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []
        self.states = []

    def act(self, state):
        state_t = torch.FloatTensor(state).unsqueeze(0)
        logits, value = self.network(state_t)
        dist = Categorical(logits=logits)
        action = dist.sample()
        # squeeze() so the stacked tensors in learn() are 1-D; a (T, 1)
        # log-prob tensor would silently broadcast against (T,) advantages
        self.log_probs.append(dist.log_prob(action).squeeze())
        self.values.append(value.squeeze())
        return action.item()

    def remember(self, reward, done):
        self.rewards.append(reward)
        self.dones.append(done)

    def store_state(self, state):
        self.states.append(state.copy() if hasattr(state, 'copy') else state)
    def learn(self):
        # Compute discounted returns G_t = r_t + gamma * G_{t+1}
        returns = []
        R = 0
        for reward, done in zip(reversed(self.rewards), reversed(self.dones)):
            if done:
                R = 0
            R = reward + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32)
        # Normalize returns for stability (the critic is then regressed
        # onto this normalized scale, keeping both losses comparable)
        if len(returns) > 1 and returns.std() > 1e-8:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        log_probs = torch.stack(self.log_probs)
        values = torch.stack(self.values)
        # Advantage = return - estimated value; detached so the actor loss
        # does not backpropagate through the critic head
        advantages = returns - values.detach()
        # Actor loss: -log_prob * advantage
        actor_loss = -(log_probs * advantages).mean()
        # Critic loss: MSE between value and return
        critic_loss = nn.MSELoss()(values, returns)
        # Entropy bonus for exploration; computed WITH gradients, otherwise
        # the bonus would have no effect on the policy parameters
        if len(self.states) > 0:
            state_t = torch.FloatTensor(np.array(self.states))
            logits, _ = self.network(state_t)
            probs = torch.softmax(logits, dim=-1)
            entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1).mean()
        else:
            entropy = torch.tensor(0.0)
        loss = actor_loss + critic_loss - self.entropy_coef * entropy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Clear episode memory
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []
        self.states = []
        return actor_loss.item(), critic_loss.item(), entropy.item()
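
# Optional: seed the RNGs so a run is (approximately) repeatable. The seed
# value is an arbitrary choice; gymnasium can also be seeded on the
# environment side via env.reset(seed=...).
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)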
# 4. Training
agent = PolicyGradientAgent(state_dim, action_dim)
episodes = 300
reward_history = []
actor_losses = []
critic_losses = []
entropies = []
print("\n🚀 Training the agent on CartPole...")
print("The goal is to balance the pole for 500 steps.\n")
for ep in range(episodes):
    state, _ = env.reset()
    total_reward = 0
    while True:
        agent.store_state(state)
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.remember(reward, done)
        total_reward += reward
        state = next_state
        if done:
            break
    a_loss, c_loss, ent = agent.learn()
    reward_history.append(total_reward)
    actor_losses.append(a_loss)
    critic_losses.append(c_loss)
    entropies.append(ent)
    # Report progress every 50 episodes
    if (ep + 1) % 50 == 0:
        avg_reward = np.mean(reward_history[-50:])
        print(f"Episode {ep+1:3d}/{episodes} | Mean reward (last 50): {avg_reward:6.2f} | Entropy: {ent:.3f}")
env.close()
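
# Quick evaluation of the trained policy (a minimal sketch, assuming a
# greedy argmax over the logits as the test-time action choice).
eval_env = gym.make('CartPole-v1')
state, _ = eval_env.reset()
eval_reward, done = 0, False
while not done:
    with torch.no_grad():
        logits, _ = agent.network(torch.FloatTensor(state).unsqueeze(0))
    action = logits.argmax(dim=-1).item()
    state, reward, terminated, truncated, _ = eval_env.step(action)
    eval_reward += reward
    done = terminated or truncated
eval_env.close()
print(f"Greedy evaluation episode: {eval_reward:.0f} steps")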
# 5. Plots
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(reward_history, alpha=0.7, linewidth=0.8)
plt.title('Reward per Episode', fontsize=12)
plt.xlabel('Episode')
plt.ylabel('Steps balanced')
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 2)
plt.plot(actor_losses, label='Actor (policy)', alpha=0.7, linewidth=0.8)
plt.plot(critic_losses, label='Critic (value)', alpha=0.7, linewidth=0.8)
plt.title('Actor and Critic Losses', fontsize=12)
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 3)
plt.plot(entropies, color='green', alpha=0.7, linewidth=0.8)
plt.title('Policy Entropy', fontsize=12)
plt.xlabel('Episode')
plt.ylabel('Entropy (exploration)')
plt.grid(True, alpha=0.3)
plt.suptitle('Reinforcement Learning - Policy-Based Method (Actor-Critic)', fontsize=14)
plt.tight_layout()
plt.show()
print("\n✅ Training complete!")
print(f"Best score: {max(reward_history):.0f} steps")
print(f"Average score over the last 50 episodes: {np.mean(reward_history[-50:]):.2f} steps")