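"""Regret matching for rock-paper-scissors.

`train` runs regret matching for one player against a fixed opponent strategy;
`train2p` runs it in self-play for both players. The returned strategy sums,
once normalized, are the average strategies, which are the quantities that
converge under regret matching.
"""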
import numpy as np
import random

np.set_printoptions(precision=4, suppress=True)

ROCK, PAPER, SCISSORS = 0, 1, 2
NUM_ACTIONS = 3


def normalize(strategy):
    """Normalize to a probability distribution; fall back to uniform if all zero."""
    strategy = np.copy(strategy)
    normalizing_sum = np.sum(strategy)
    if normalizing_sum > 0:
        strategy /= normalizing_sum
    else:
        strategy = np.ones(NUM_ACTIONS) / NUM_ACTIONS
    return strategy


def get_strategy(regret_sum):
    """Regret matching: mix actions in proportion to their positive regrets."""
    return normalize(np.maximum(regret_sum, 0))


def get_action(strategy):
    """Sample an action index from a mixed strategy via inverse-CDF sampling."""
    strategy = strategy / np.sum(strategy)
    return np.searchsorted(np.cumsum(strategy), random.random())

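# Note (added): np.random.choice(NUM_ACTIONS, p=strategy) would be an
# equivalent built-in way to draw the sample, provided p sums to 1.

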
def value(p1, p2):
    """Payoff to p1: +1 for a win, -1 for a loss, 0 for a tie.

    Kept from the original listing; the trainers below build the same
    payoffs inline rather than calling this.
    """
    if p1 == p2:
        return 0
    elif p1 == ROCK and p2 == SCISSORS:
        return 1
    elif p1 == SCISSORS and p2 == PAPER:
        return 1
    elif p1 == PAPER and p2 == ROCK:
        return 1
    return -1

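
# A sketch (not in the original paste): the same payoffs as a matrix, which
# makes expected values easy to compute. PAYOFFS and expected_value are
# illustrative helpers, not part of the original listing.
PAYOFFS = np.array([[ 0, -1,  1],   # rock     vs rock, paper, scissors
                    [ 1,  0, -1],   # paper
                    [-1,  1,  0]])  # scissors


def expected_value(s1, s2):
    """Expected payoff to player 1 for mixed strategies s1 and s2."""
    return s1 @ PAYOFFS @ s2

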
def train(iterations):
    """Regret matching for one player against a fixed opponent strategy."""
    opp_strategy = np.array([1/3, 1/3, 1/3])
    regret_sum = np.zeros(NUM_ACTIONS)
    strategy_sum = np.zeros(NUM_ACTIONS)
    action_utility = np.zeros(NUM_ACTIONS)
    for _ in range(iterations):
        # Get regret-matched mixed-strategy actions
        strategy = get_strategy(regret_sum)
        strategy_sum += strategy
        my_action = get_action(strategy)
        other_action = get_action(opp_strategy)

        # Compute action utilities: matching the opponent ties, the next
        # action (mod 3) beats it, the previous one loses to it
        action_utility[other_action] = 0
        action_utility[(other_action + 1) % NUM_ACTIONS] = 1
        action_utility[(other_action - 1) % NUM_ACTIONS] = -1

        # Accumulate action regrets
        regret_sum += action_utility - action_utility[my_action]
    return strategy_sum


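# Note (added): against the uniform opponent every strategy earns 0 in
# expectation, so train() can drift to an arbitrary mixture. Skew the
# opponent, e.g. opp_strategy = np.array([0.4, 0.3, 0.3]), and the average
# strategy converges to the pure best response (PAPER in that case).

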
def train2p(iterations):
    """Regret matching in self-play; both average strategies approach Nash."""
    regret_sum1 = np.zeros(NUM_ACTIONS)
    strategy_sum1 = np.zeros(NUM_ACTIONS)
    regret_sum2 = np.zeros(NUM_ACTIONS)
    strategy_sum2 = np.zeros(NUM_ACTIONS)
    action_utility1 = np.zeros(NUM_ACTIONS)
    action_utility2 = np.zeros(NUM_ACTIONS)
    for _ in range(iterations):
        # Get regret-matched mixed-strategy actions (get_strategy already
        # normalizes, so no second normalize() call is needed)
        strategy1 = get_strategy(regret_sum1)
        strategy2 = get_strategy(regret_sum2)
        strategy_sum1 += strategy1
        strategy_sum2 += strategy2  # bug fix: the original never accumulated player 2's strategy
        action1 = get_action(strategy1)
        action2 = get_action(strategy2)

        # Each player's utilities are computed against the opponent's action;
        # the original negated player 1's utilities, which indexes regrets by
        # the wrong player's actions
        action_utility1[action2] = 0
        action_utility1[(action2 + 1) % NUM_ACTIONS] = 1
        action_utility1[(action2 - 1) % NUM_ACTIONS] = -1
        action_utility2[action1] = 0
        action_utility2[(action1 + 1) % NUM_ACTIONS] = 1
        action_utility2[(action1 - 1) % NUM_ACTIONS] = -1

        # Accumulate action regrets
        regret_sum1 += action_utility1 - action_utility1[action1]
        regret_sum2 += action_utility2 - action_utility2[action2]
    return strategy_sum1, strategy_sum2


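# Example check (uses the illustrative expected_value helper above): Nash
# play makes RPS a fair game, so the self-play averages should score near 0:
#   s1, s2 = train2p(10000)
#   print(expected_value(normalize(s1), normalize(s2)))  # ~0.0

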
if __name__ == "__main__":
    ITERATIONS = 10000
    trained_strategy = train(ITERATIONS)
    print(normalize(trained_strategy))

    trained_strategy1, trained_strategy2 = train2p(ITERATIONS)
    print(normalize(trained_strategy1), normalize(trained_strategy2))