Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,33 +1,51 @@
# 参赛须知

* 自定义Docker-Agent算法
* 使用您的算法,修改 examples/docker-agent/train.py 函数并且训练和保存模型;
* 使用您的模型与算法,完善run.py中Agent类的act函数
* Note:若您使用了额外的python包,请在requirements.txt添加附加依赖。
- 下载Playground环境以及配置对应虚拟环境

* 测试您的算法
- 下载环境以及example

* 进入下载的playground文件夹,配置pommerman
```bash
git clone [email protected]:mail-ecnu/example_playground.git
```

```bash
cd playground
conda env create -f env.yml
conda activate pommerman
```

* Note:训练前请完成以上步骤
- 配置环境:

* 安装对应docker镜像中的agent。这个安装过程比较长,需要下载比较多东西。
- Note:若您使用了额外的python包,请在requirements.txt添加附加依赖。
- 在example_playground文件夹,配置pommerman

```bash
docker build -t pommerman/simple-agent -f examples/docker-agent/Dockerfile .
```
```bash
cd example_playground
conda env create -f env.yml
conda activate pommerman
```

* 运行比赛
- 自定义Docker-Agent

```
python examples/simple_ffa_run.py
```
- 参考[main分支](https://github.com/mail-ecnu/example_playground)中example_playground/examples/docker-agent下的train.py,run.py两个文件或者参考[A2C分支](https://github.com/mail-ecnu/example_playground/tree/A2C)下example_playground/examples/docker-agent/A2C/main.py文件完成你的算法。(两个实例代码都包含了完整的训练代码)
- 你的Agent类需要继承agents.BaseAgent,必须要完善的是agent的act方法,在这里完善你的policy。
- 完善你的算法,使用您的算法,训练和保存模型;

您可以通过修改和输出examples/simple_ffa_run.py中的参数测试您的算法性能。
- 测试您的算法

- 提交代码前可以进行本地测试保证你的代码可以成功运行。

- 根据dockerfile生成镜像。

```bash
sudo docker build -t pommerman/simple-agent -f examples/docker-agent/Dockerfile .
```

- 运行比赛

```
sudo python examples/simple_ffa_run.py
```

您可以通过修改examples/simple_ffa_run.py中的参数测试您的算法性能。

如果docker运行失败需要多次运行以上代码。

- 提交您的代码

- 在校园网环境下访问比赛网页(内部测试阶段)注册账户,在网页的profile中上传你的代码对应的github私钥,然后进行提交。
- 提交页面需要输入三项信息:AgentName,Your repository url,Your dockerfile path,需要注意url必须选择ssh格式。
3 changes: 3 additions & 0 deletions env.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
name: pommerman
channels:
- defaults
- conda-forge
dependencies:
- python=3.7.*
- pip
Expand Down
Binary file added examples/docker-agent/A2C/convrnn-s.weights
Binary file not shown.
233 changes: 233 additions & 0 deletions examples/docker-agent/A2C/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
from threading import Thread
from model import *
import pommerman
import colorama
from pommerman import agents
from collections import Counter
import os
import sys
import time
import random
import math
# import pickle
import os

ROLLOUTS_PER_BATCH = 1  # episodes collected per training update
batch = []  # NOTE(review): never read or written in this file — candidate for removal


class World():
    """Wires together the A2C models, the learning agent and a Pommerman env.

    Holds the GPU-backed global model (optional), the CPU-local agent model,
    the agent roster, and configures numpy printing so boards render in colour.
    """

    def __init__(self, init_gmodel=True):
        """Build models, agents and environment.

        init_gmodel: when True, also create the GPU global model used for
        training updates (evaluation-only callers can skip it).
        """
        if init_gmodel:
            self.gmodel = A2CNet(gpu=True)  # Global model

        self.model = A2CNet(gpu=False)  # Agent (local) model
        self.leif = Leif(self.model)
        self.stoner = Stoner()

        # One learning agent plus three scripted opponents.
        self.agent_list = [
            self.leif,
            # self.stoner
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            agents.SimpleAgent()
        ]
        self.env = normal_env(self.agent_list)  # naked_env

        # Colour-code every int/float numpy prints (i.e. the board dumps).
        np.set_printoptions(
            formatter={'int': self.color_sign, 'float': self.color_sign},
            linewidth=300,
        )

    def color_sign(self, x):
        """Return *x* left-padded to width 2 and wrapped in an ANSI colour."""
        palette = {
            0: colorama.Fore.LIGHTBLACK_EX,
            1: colorama.Fore.BLACK,
            2: colorama.Fore.BLUE,
            3: colorama.Fore.RED,
            4: colorama.Fore.RED,
            10: colorama.Fore.YELLOW,
        }
        colour = palette.get(x, colorama.Fore.WHITE)
        padded = '{0: <2}'.format(x)
        return f'{colour}{padded}{colorama.Fore.RESET}'


def do_rollout(env, leif, do_print=False):
    """Play one full episode and return the recorded trajectory.

    The episode terminates early as soon as the learning agent (player 0)
    receives a -1 reward, i.e. dies.

    Returns a tuple:
        (states, actions, rewards, dones, (hns, cns), probs, values)
    where hns/cns are the LSTM hidden/cell states per step (last one dropped,
    since it was produced after the final action).
    """
    state = env.reset()
    states, actions, hidden, probs, values = leif.clear()
    rewards = []
    dones = []
    done = False

    while not done:
        if do_print:
            time.sleep(0.1)
            os.system('clear')
            print(state[0]['board'])

        joint_action = env.act(state)
        state, reward, done, _info = env.step(joint_action)
        # Stop as soon as our agent is eliminated, even if others play on.
        done = done or reward[0] == -1
        rewards.append(reward[0])
        dones.append(done)

    # Drop the hidden state recorded after the terminal step, then split the
    # (h, c) pairs into parallel lists.
    trimmed = hidden[:-1].copy()
    hns = [pair[0] for pair in trimmed]
    cns = [pair[1] for pair in trimmed]

    return (states.copy(),
            actions.copy(),
            rewards, dones,
            (hns, cns),
            probs.copy(),
            values.copy())


def gmodel_train(gmodel, states, hns, cns, actions, rewards, gae):
    """Run one A2C optimisation step on the global model.

    states/hns/cns are lists of per-step tensors; actions are the taken
    action indices; rewards are discounted returns; gae are the advantage
    estimates used to weight the policy gradient.

    Returns (total_loss, policy_loss, value_loss) as Python floats.
    """
    states = torch.stack(states)
    hns = torch.stack(hns, dim=0)
    cns = torch.stack(cns, dim=0)

    gmodel.train()
    logits, values, _, _ = gmodel(states.to(gmodel.device),
                                  hns.to(gmodel.device),
                                  cns.to(gmodel.device),
                                  debug=False)

    prob = F.softmax(logits, dim=-1)
    log_prob = F.log_softmax(logits, dim=-1)
    entropy = -(log_prob * prob).sum(1)

    # Log-probability of the action actually taken at each step.
    taken_log_probs = log_prob[range(0, len(actions)), actions]
    advantages = torch.tensor(rewards).to(gmodel.device) - values.squeeze(1)
    value_loss = 0.5 * advantages.pow(2)
    policy_loss = (-taken_log_probs * torch.tensor(gae).to(gmodel.device)
                   - gmodel.entropy_coef * entropy)

    gmodel.optimizer.zero_grad()
    pl = policy_loss.sum()
    vl = value_loss.sum()
    total = pl + vl
    total.backward()
    gmodel.optimizer.step()

    return total.item(), pl.item(), vl.item()


def unroll_rollouts(gmodel, list_of_full_rollouts):
    """Flatten rollouts into parallel training lists and compute GAE.

    Each rollout is the 7-tuple produced by do_rollout. Rewards are run
    through gmodel.discount_rewards; advantages use Generalised Advantage
    Estimation with tau fixed to 1.

    Returns (states, hns, cns, actions, rewards, gae).
    """
    gamma = gmodel.gamma
    tau = 1  # GAE lambda; 1 == plain discounted advantage sum

    states, actions, rewards = [], [], []
    hns, cns, gae = [], [], []

    for (s, a, r, d, h, p, v) in list_of_full_rollouts:
        states.extend(torch.tensor(s))
        actions.extend(a)
        rewards.extend(gmodel.discount_rewards(r))

        hns.extend(torch.tensor(hh) for hh in h[0])
        cns.extend(torch.tensor(hh) for hh in h[1])

        # Accumulate GAE backwards through the episode; the terminal step
        # bootstraps with value 0.
        episode_gae = []
        running = 0
        last_step = len(r) - 1
        for t in reversed(range(len(r))):
            bootstrap = v[t + 1] if t != last_step else 0
            delta = r[t] + gamma * bootstrap - v[t]
            running = running * gamma * tau + delta
            episode_gae.insert(0, running)
        gae.extend(episode_gae)

    return states, hns, cns, actions, rewards, gae


def train(world):
    """Main training loop: collect rollouts, update the global model,
    log progress to stdout and training.txt, and periodically checkpoint
    the weights to convrnn-s.weights.
    """
    model, gmodel = world.model, world.gmodel
    leif, env = world.leif, world.env

    # Resuming from a checkpoint is intentionally disabled via the leading
    # `False`; flip it to True to load convrnn-s.weights before training.
    if False and os.path.isfile("convrnn-s.weights"):
        model.load_state_dict(torch.load("convrnn-s.weights", map_location='cpu'))
        gmodel.load_state_dict(torch.load("convrnn-s.weights", map_location='cpu'))

    # Start every run with a fresh log file.
    if os.path.exists("training.txt"): os.remove("training.txt")

    rr = -1  # exponentially smoothed mean final reward (EMA, alpha=0.01)
    ii = 0   # cumulative number of actions taken across all rollouts
    for i in range(40000):
        full_rollouts = [do_rollout(env, leif) for _ in range(ROLLOUTS_PER_BATCH)]
        last_rewards = [roll[2][-1] for roll in full_rollouts]
        not_discounted_rewards = [roll[2] for roll in full_rollouts]
        states, hns, cns, actions, rewards, gae = unroll_rollouts(gmodel, full_rollouts)
        # Sigmoid schedule: gamma ramps from ~0.5 toward 1.0, centred at iter 20000.
        gmodel.gamma = 0.5 + 1 / 2. / (1 + math.exp(-0.0003 * (i - 20000)))  # adaptive gamma
        l, pl, vl = gmodel_train(gmodel, states, hns, cns, actions, rewards, gae)
        rr = rr * 0.99 + np.mean(last_rewards) / ROLLOUTS_PER_BATCH * 0.01
        ii += len(actions)
        print(i, "\t", round(gmodel.gamma, 3), round(rr, 3), "\twins:", last_rewards.count(1), Counter(actions),
              round(sum(rewards), 3), round(l, 3), round(pl, 3), round(vl, 3))
        with open("training.txt", "a") as f:
            print(rr, "\t", round(gmodel.gamma, 4), "\t", round(vl, 3), "\t", round(pl, 3), "\t", round(l, 3), file=f)
        # Sync the local (acting) model with the freshly updated global one.
        model.load_state_dict(gmodel.state_dict())
        if i >= 10 and i % 300 == 0: torch.save(gmodel.state_dict(), "convrnn-s.weights")


def run(world):
    """Play one episode, printing the agent's centred observation tensors
    (board, bombs, blast strength) after every step.
    """
    state = world.env.reset()
    world.leif.clear()
    done = False

    while not done:
        joint_action = world.env.act(state)
        state, reward, done, _info = world.env.step(joint_action)
        print(world.leif.board_cent)
        print(world.leif.bbs_cent)
        print(world.leif.bl_cent)
        time.sleep(0.2)

    world.env.close()
    return None


def eval(world, init_gmodel=False):
    """Endlessly replay episodes with the latest saved weights, rendering
    the board plus the agent's action probabilities and value estimate.

    NOTE(review): shadows the builtin `eval`; renaming would break the
    `locals()[entrypoint]` CLI dispatch at module bottom, so it stays.
    Loops forever — stop with Ctrl-C.
    """
    env = world.env
    model = world.model
    leif = world.leif
    leif.debug = True
    leif.stochastic = False  # act greedily during evaluation

    do_print = True
    done = None
    reward = 0
    last_reward = [0, 0, 0, 0]

    while True:
        # Pick up the newest checkpoint before every episode.
        model.load_state_dict(torch.load("convrnn-s.weights", map_location='cpu'))

        done = False
        state = env.reset()
        leif.clear()
        t = 0
        while not done:
            if do_print:
                time.sleep(0.1)
                os.system('clear')
                print(state[0]['board'])
                print("\n\n")
                print("Probs: \t", leif.probs[-1] if len(leif.probs) > 0 else [])
                print("Val: \t", leif.values[-1] if len(leif.values) > 0 else None)
                print("\nLast reward: ", last_reward, "Time", t)

            action = env.act(state)
            state, reward, done, info = env.step(action)
            if reward[0] == -1:
                last_reward = reward
                break
            t += 1


def readme(world):
    """Print usage instructions; the default entry point when no CLI
    command is supplied. *world* is accepted (and ignored) so the
    dispatcher can call every entry point uniformly.
    """
    usage_lines = [
        "Usage: ",
        "\t to train:\tpython main.py train",
        "\t to evaluate:\tpython main.py eval\n\n",
        "Procedure:",
        "Start the training. Wait for 300 episodes (this will generate weights file). Run evaluate. See running results.",
    ]
    for line in usage_lines:
        print(line)
    return None


# Dispatch on the first CLI argument (train / eval / run / readme),
# defaulting to the usage text when no command is given.
entrypoint = next(iter(sys.argv[1:]), "readme")
if __name__ == "__main__":
    # Guarded so importing this module no longer starts training/evaluation
    # as a side effect; behaviour when run as a script is unchanged.
    locals()[entrypoint](World())
Loading