import numpy as np
import random
np.set_printoptions(precision=4, suppress=True)
ROCK, PAPER, SCISSORS = 0, 1, 2
NUM_ACTIONS = 3
def normalize(strategy):
    """Return *strategy* rescaled into a probability distribution.

    If the entries sum to a positive value they are divided by that sum;
    otherwise (zero or negative total mass) a uniform distribution over
    the same number of actions is returned.

    The input is never mutated: values are converted to a fresh float
    array first, which also avoids the TypeError that in-place true
    division (`/=`) raises on integer arrays.
    """
    strategy = np.asarray(strategy, dtype=float)
    total = strategy.sum()
    if total > 0:
        return strategy / total
    # No positive mass to normalize: fall back to uniform over however
    # many actions the caller passed (generalizes beyond NUM_ACTIONS).
    return np.full(strategy.shape, 1.0 / strategy.size)
def get_strategy(regret_sum):
    """Regret matching: play each action in proportion to its positive regret.

    Negative regrets are clipped to zero before normalizing, so actions
    that have only hurt us so far get zero probability.
    """
    positive_regrets = np.maximum(regret_sum, 0)
    return normalize(positive_regrets)
def get_action(strategy):
    """Sample an action index from a (possibly unnormalized) mixed strategy.

    Draws a uniform random number and inverts the strategy's CDF via
    searchsorted. The result is clamped to the last valid index because
    floating-point rounding in cumsum can leave the final CDF entry
    fractionally below 1.0, which would otherwise let a draw near 1.0
    produce an out-of-range index.
    """
    strategy = np.asarray(strategy, dtype=float)
    cdf = np.cumsum(strategy / strategy.sum())
    idx = int(np.searchsorted(cdf, random.random()))
    return min(idx, strategy.size - 1)
def value(p1, p2):
    """Payoff to player 1 for a single RPS round: +1 win, -1 loss, 0 draw.

    With the 0/1/2 encoding each action beats its cyclic predecessor:
    rock(0) beats scissors(2), paper(1) beats rock(0), scissors(2) beats
    paper(1) — i.e. player 1 wins exactly when (p1 - p2) % 3 == 1.
    """
    if p1 == p2:
        return 0
    return 1 if (p1 - p2) % 3 == 1 else -1
def train(iterations):
    """Train a regret-matching player against a fixed uniform opponent.

    Each iteration samples both players' actions, computes the
    counterfactual utility of every action against the opponent's
    realized action, and accumulates the resulting regrets.

    Returns the unnormalized sum of the per-iteration strategies;
    normalize it to obtain the average strategy.
    """
    opp_strategy = np.array([1/3, 1/3, 1/3])
    regret_sum = np.zeros(NUM_ACTIONS)
    strategy_sum = np.zeros(NUM_ACTIONS)
    for _ in range(iterations):
        # Regret-matched mixed strategy for this iteration.
        strategy = get_strategy(regret_sum)
        strategy_sum += strategy
        my_action = get_action(strategy)
        opp_action = get_action(opp_strategy)
        # Counterfactual utilities vs the opponent's realized action:
        # beating action scores +1, losing action -1, tying action 0.
        utilities = np.zeros(NUM_ACTIONS)
        utilities[(opp_action + 1) % NUM_ACTIONS] = 1
        utilities[(opp_action - 1) % NUM_ACTIONS] = -1
        # Regret of each action relative to what we actually played.
        regret_sum += utilities - utilities[my_action]
    return strategy_sum
def train2p(iterations):
    """Train two regret-matching players against each other (self-play).

    Each iteration both players sample from their regret-matched
    strategies, then each accumulates regret from its OWN counterfactual
    utilities against the opponent's realized action.

    Bug fixes vs the previous version:
    - strategySum2 was never accumulated, so player 2's returned average
      strategy was always the zero vector.
    - player 2's regret was taken as the negation of player 1's regret
      vector, which is not player 2's counterfactual regret; it must be
      built from player 2's utilities against player 1's realized action.

    Returns (strategySum1, strategySum2): unnormalized cumulative
    strategies; normalize each to get the average strategy.
    """
    regretSum1 = np.zeros(NUM_ACTIONS)
    strategySum1 = np.zeros(NUM_ACTIONS)
    regretSum2 = np.zeros(NUM_ACTIONS)
    strategySum2 = np.zeros(NUM_ACTIONS)
    utility1 = np.zeros(NUM_ACTIONS)
    utility2 = np.zeros(NUM_ACTIONS)
    for i in range(iterations):
        # Get regret-matched mixed-strategy actions.
        # get_strategy already returns a normalized distribution, so no
        # extra normalize() call is needed before sampling.
        strategy1 = get_strategy(regretSum1)
        strategy2 = get_strategy(regretSum2)
        strategySum1 += strategy1
        strategySum2 += strategy2
        myAction = get_action(strategy1)
        otherAction = get_action(strategy2)
        # Player 1's counterfactual utilities vs player 2's action.
        utility1[otherAction] = 0
        utility1[(otherAction + 1) % NUM_ACTIONS] = 1
        utility1[(otherAction - 1) % NUM_ACTIONS] = -1
        # Player 2's counterfactual utilities vs player 1's action.
        utility2[myAction] = 0
        utility2[(myAction + 1) % NUM_ACTIONS] = 1
        utility2[(myAction - 1) % NUM_ACTIONS] = -1
        # Each player's regret is measured against its own played action.
        regretSum1 += utility1 - utility1[myAction]
        regretSum2 += utility2 - utility2[otherAction]
    return strategySum1, strategySum2
if __name__ == "__main__":
    ITERATIONS = 10000
    # Average strategy learned against a fixed uniform opponent.
    print(normalize(train(ITERATIONS)))
    # Average strategies from two-player self-play.
    avg1, avg2 = train2p(ITERATIONS)
    print(normalize(avg1), normalize(avg2))