import numpy as np

class LiarDieTrainer:
    # Action indices for response nodes: challenge the claim, or accept it.
    DOUBT, ACCEPT = 0, 1

    class Node:
        # One information set: cumulative regrets, the current
        # regret-matched strategy, and FSICFR reach probabilities.
        def __init__(self, numActions):
            self.u = 0.0          # node utility, filled in by the backward pass
            self.pPlayer = 0.0    # player's reach probability
            self.pOpponent = 0.0  # opponent's reach probability
            self.regretSum = np.zeros(numActions)
            self.strategy = np.zeros(numActions)
            self.strategySum = np.zeros(numActions)

        def getStrategy(self):
            # Regret matching: play each action in proportion to its positive
            # cumulative regret, falling back to uniform when none is positive.
            self.strategy = np.maximum(self.regretSum, 0)
            normalizingSum = np.sum(self.strategy)
            if normalizingSum > 0:
                self.strategy /= normalizingSum
            else:
                self.strategy.fill(1.0 / len(self.strategy))
            # Accumulate the average strategy, weighted by the player's reach.
            self.strategySum += self.pPlayer * self.strategy
            return self.strategy

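        # Worked example of regret matching: regretSum = [2.0, -1.0] clips to
        # [2.0, 0.0] and normalizes to strategy = [1.0, 0.0]; if no regret is
        # positive, e.g. regretSum = [-3.0, -1.0], the fallback is [0.5, 0.5].
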
        def getAverageStrategy(self):
            # Normalize the accumulated strategy (mutating strategySum in
            # place, as the original does); unreached nodes become uniform.
            normalizingSum = np.sum(self.strategySum)
            if normalizingSum > 0:
                self.strategySum /= normalizingSum
            else:
                self.strategySum.fill(1.0 / len(self.strategySum))
            return self.strategySum

    def __init__(self, sides):
        self.sides = sides
        # responseNodes[myClaim, oppClaim]: decide whether to doubt or accept
        # the opponent's raise to oppClaim; a claim of `sides` can only be doubted.
        self.responseNodes = np.empty((sides, sides + 1), dtype=self.Node)
        for myClaim in range(sides):
            for oppClaim in range(myClaim + 1, sides + 1):
                self.responseNodes[myClaim, oppClaim] = self.Node(1 if oppClaim == sides else 2)
        # claimNodes[oppClaim, roll]: choose a new claim in oppClaim+1..sides
        # after accepting oppClaim and rolling `roll`.
        self.claimNodes = np.empty((sides, sides + 1), dtype=self.Node)
        for oppClaim in range(sides):
            for roll in range(1, sides + 1):
                self.claimNodes[oppClaim, roll] = self.Node(sides - oppClaim)

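    # For sides = 6 this allocates 21 response nodes (one per pair
    # myClaim < oppClaim <= 6) and 36 claim nodes (one per accepted claim
    # 0..5 and roll 1..6); unused grid slots remain None.
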
    def train(self, iterations):
        regret = np.zeros(self.sides)
        rollAfterAcceptingClaim = np.zeros(self.sides, dtype=int)
        for it in range(iterations):
            # FSICFR: pre-sample the chance outcome (the die roll) that
            # follows the acceptance of each possible claim.
            for i in range(len(rollAfterAcceptingClaim)):
                rollAfterAcceptingClaim[i] = np.random.randint(self.sides) + 1
            # The root: the first player's claim node after the initial roll.
            self.claimNodes[0, rollAfterAcceptingClaim[0]].pPlayer = 1
            self.claimNodes[0, rollAfterAcceptingClaim[0]].pOpponent = 1

            # Forward pass: visit nodes in topological order, accumulating
            # both players' reach probabilities.
            for oppClaim in range(self.sides + 1):
                # Visit response nodes forward.
                if oppClaim > 0:
                    for myClaim in range(oppClaim):
                        node = self.responseNodes[myClaim, oppClaim]
                        actionProb = node.getStrategy()
                        if oppClaim < self.sides:
                            nextNode = self.claimNodes[oppClaim, rollAfterAcceptingClaim[oppClaim]]
                            nextNode.pPlayer += actionProb[self.ACCEPT] * node.pPlayer
                            nextNode.pOpponent += node.pOpponent

                # Visit claim nodes forward; the turn passes, so player and
                # opponent reach probabilities swap roles at the successor.
                if oppClaim < self.sides:
                    node = self.claimNodes[oppClaim, rollAfterAcceptingClaim[oppClaim]]
                    actionProb = node.getStrategy()
                    for myClaim in range(oppClaim + 1, self.sides + 1):
                        nextClaimProb = actionProb[myClaim - oppClaim - 1]
                        if nextClaimProb > 0:
                            nextNode = self.responseNodes[oppClaim, myClaim]
                            nextNode.pPlayer += node.pOpponent
                            nextNode.pOpponent += nextClaimProb * node.pPlayer

            # Backward pass: visit nodes in reverse topological order,
            # propagating utilities and accumulating counterfactual regrets.
            for oppClaim in reversed(range(self.sides + 1)):
                # Visit claim nodes backward.
                if oppClaim < self.sides:
                    node = self.claimNodes[oppClaim, rollAfterAcceptingClaim[oppClaim]]
                    actionProb = node.strategy
                    node.u = 0.0
                    for myClaim in range(oppClaim + 1, self.sides + 1):
                        actionIndex = myClaim - oppClaim - 1
                        nextNode = self.responseNodes[oppClaim, myClaim]
                        childUtil = -nextNode.u  # zero-sum: negate the successor's utility
                        regret[actionIndex] = childUtil
                        node.u += actionProb[actionIndex] * childUtil
                    for a in range(len(actionProb)):
                        regret[a] -= node.u
                        node.regretSum[a] += node.pOpponent * regret[a]
                    node.pPlayer = node.pOpponent = 0

                # Visit response nodes backward.
                if oppClaim > 0:
                    for myClaim in range(oppClaim):
                        node = self.responseNodes[myClaim, oppClaim]
                        actionProb = node.strategy
                        node.u = 0.0
                        # Doubting wins exactly when the claim exceeded the roll.
                        doubtUtil = 1 if oppClaim > rollAfterAcceptingClaim[myClaim] else -1
                        regret[self.DOUBT] = doubtUtil
                        node.u += actionProb[self.DOUBT] * doubtUtil
                        if oppClaim < self.sides:
                            nextNode = self.claimNodes[oppClaim, rollAfterAcceptingClaim[oppClaim]]
                            # Assign rather than "+=": the shared regret buffer
                            # still holds a stale value from the previous node.
                            regret[self.ACCEPT] = nextNode.u
                            node.u += actionProb[self.ACCEPT] * nextNode.u
                        for a in range(len(actionProb)):
                            regret[a] -= node.u
                            node.regretSum[a] += node.pOpponent * regret[a]
                        node.pPlayer = node.pOpponent = 0

            # Reset the average-strategy accumulators halfway through training
            # so the reported average ignores early, noisy iterations.
            if it == iterations // 2:
                for nodes in self.responseNodes:
                    for node in nodes:
                        if node:
                            node.strategySum.fill(0)
                for nodes in self.claimNodes:
                    for node in nodes:
                        if node:
                            node.strategySum.fill(0)

        # Report the learned average strategy profile.
        for initialRoll in range(1, self.sides + 1):
            print("Initial claim policy with roll %d: %s"
                  % (initialRoll, np.round(self.claimNodes[0, initialRoll].getAverageStrategy(), 2)))
        print("\nOld Claim\tNew Claim\tAction Probabilities")
        for myClaim in range(self.sides):
            for oppClaim in range(myClaim + 1, self.sides + 1):
                print("\t%d\t%d\t%s" % (myClaim, oppClaim, self.responseNodes[myClaim, oppClaim].getAverageStrategy()))
        print("\nOld Claim\tRoll\tAction Probabilities")
        for oppClaim in range(self.sides):
            for roll in range(1, self.sides + 1):
                print("%d\t%d\t%s" % (oppClaim, roll, self.claimNodes[oppClaim, roll].getAverageStrategy()))

trainer = LiarDieTrainer(6)
trainer.train(1000)

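Die rolls come from np.random.randint, so every run (including the sample captured below) differs. A minimal sketch of a reproducible rerun plus a normalization sanity check; the seed value is an arbitrary assumption, not part of the original program:

np.random.seed(42)  # assumed seed, chosen only for reproducibility
trainer = LiarDieTrainer(6)
trainer.train(1000)

# Every node's average strategy should be a probability distribution.
for grid in (trainer.responseNodes, trainer.claimNodes):
    for node in grid.flat:
        if node:
            assert abs(np.sum(node.getAverageStrategy()) - 1.0) < 1e-9
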
Output (one sample run, 0.77 s):
Initial claim policy with roll 1: [ 1.  0.  0.  0.  0.  0.]
Initial claim policy with roll 2: [ 0.  1.  0.  0.  0.  0.]
Initial claim policy with roll 3: [ 0.  0.  1.  0.  0.  0.]
Initial claim policy with roll 4: [ 0.  0.  0.  1.  0.  0.]
Initial claim policy with roll 5: [ 0.  0.  0.  0.  1.  0.]
Initial claim policy with roll 6: [ 0.  0.  0.  0.  0.  1.]

Old Claim	New Claim	Action Probabilities
	0	1	[ 0.22093023  0.77906977]
	0	2	[ 0.34375  0.65625]
	0	3	[ 0.29577465  0.70422535]
	0	4	[ 0.5  0.5]
	0	5	[ 1.  0.]
	0	6	[ 1.]
	1	2	[ 0.5  0.5]
	1	3	[ 0.5  0.5]
	1	4	[ 1.  0.]
	1	5	[ 1.  0.]
	1	6	[ 1.]
	2	3	[ 1.  0.]
	2	4	[ 1.  0.]
	2	5	[ 1.  0.]
	2	6	[ 1.]
	3	4	[ 1.  0.]
	3	5	[ 1.  0.]
	3	6	[ 1.]
	4	5	[ 1.  0.]
	4	6	[ 1.]
	5	6	[ 1.]

Old Claim	Roll	Action Probabilities
0	1	[ 0.99568448  0.          0.          0.00431552  0.          0.        ]
0	2	[  2.68099742e-04   9.99731900e-01   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
0	3	[ 0.  0.  1.  0.  0.  0.]
0	4	[ 0.  0.  0.  1.  0.  0.]
0	5	[ 0.  0.  0.  0.  1.  0.]
0	6	[ 0.  0.  0.  0.  0.  1.]
1	1	[ 0.6040523  0.3959477  0.         0.         0.       ]
1	2	[ 1.  0.  0.  0.  0.]
1	3	[ 0.  1.  0.  0.  0.]
1	4	[ 0.          0.08349967  0.91650033  0.          0.        ]
1	5	[ 0.   0.   0.5  0.5  0. ]
1	6	[ 0.          0.          0.33333333  0.33333333  0.33333333]
2	1	[ 1.  0.  0.  0.]
2	2	[ 1.  0.  0.  0.]
2	3	[ 1.  0.  0.  0.]
2	4	[ 0.  1.  0.  0.]
2	5	[ 0.   0.5  0.5  0. ]
2	6	[ 0.   0.   0.5  0.5]
3	1	[ 0.33333333  0.33333333  0.33333333]
3	2	[ 0.28571429  0.71428571  0.        ]
3	3	[ 0.33333333  0.33333333  0.33333333]
3	4	[ 1.  0.  0.]
3	5	[ 0.5  0.5  0. ]
3	6	[ 0.33333333  0.33333333  0.33333333]
4	1	[ 0.5  0.5]
4	2	[ 1.  0.]
4	3	[ 0.5  0.5]
4	4	[ 0.5  0.5]
4	5	[ 1.  0.]
4	6	[ 0.5  0.5]
5	1	[ 1.]
5	2	[ 1.]
5	3	[ 1.]
5	4	[ 1.]
5	5	[ 1.]
5	6	[ 1.]
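
To play with the learned policy, sample an action from a node's average strategy. A minimal sketch; the helper name sampleClaim is hypothetical, not part of the trainer:

def sampleClaim(trainer, oppClaim, roll):
    # Hypothetical helper: choose the next claim after accepting oppClaim
    # with a die roll of `roll`, sampling the learned average strategy.
    strategy = trainer.claimNodes[oppClaim, roll].getAverageStrategy()
    action = np.random.choice(len(strategy), p=strategy)
    return oppClaim + 1 + action  # action i corresponds to claim oppClaim+1+i

print(sampleClaim(trainer, 0, 3))  # e.g. the opening claim after rolling a 3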