import numpy as np
class LiarDieTrainer:
    """Regret-based self-play trainer for the two-player game Liar Die.

    Each training iteration samples one die roll per claim level, pushes
    reach probabilities forward through the nodes selected by those rolls,
    and then walks the same nodes backward to propagate utilities and
    accumulate regrets.

    Attributes:
        sides: Number of sides of the die (also the highest possible claim).
        responseNodes: Object array indexed ``[myClaim, oppClaim]`` with
            ``myClaim < oppClaim <= sides``; every other cell is ``None``.
        claimNodes: Object array indexed ``[oppClaim, roll]`` with
            ``0 <= oppClaim < sides`` and ``1 <= roll <= sides``; column 0 is
            unused and stays ``None``.
    """

    # Action indices inside a response node.
    DOUBT, ACCEPT = 0, 1

    class Node:
        """Decision point holding regret and strategy accumulators.

        Attributes:
            regretSum: Accumulated regret per action.
            strategy: Strategy computed by the latest ``getStrategy`` call.
            strategySum: Sum of strategies weighted by ``pPlayer``.
            u: Utility of the node computed during the backward pass.
            pPlayer: Reach probability contributed by the acting player.
            pOpponent: Reach probability contributed by the opponent.
        """

        # Class-level defaults; assigning through an instance shadows them.
        u, pPlayer, pOpponent = 0.0, 0.0, 0.0

        def __init__(self, numActions):
            """Create a node with ``numActions`` zero-initialised actions."""
            self.regretSum = np.zeros(numActions)
            self.strategy = np.zeros(numActions)
            self.strategySum = np.zeros(numActions)

        def getStrategy(self):
            """Compute the current strategy by regret matching.

            Positive accumulated regrets are normalised into a probability
            vector; when no regret is positive the strategy is uniform. The
            result is stored in ``self.strategy`` and also added to
            ``strategySum`` weighted by ``pPlayer``.

            Returns:
                The freshly computed strategy array (``self.strategy``).
            """
            self.strategy = np.maximum(self.regretSum, 0)
            normalizingSum = np.sum(self.strategy)
            if normalizingSum > 0:
                self.strategy /= normalizingSum
            else:
                self.strategy.fill(1.0 / len(self.strategy))
            self.strategySum += self.pPlayer * self.strategy
            return self.strategy

        def getAverageStrategy(self):
            """Return the normalised average strategy.

            The result is a new array: ``strategySum`` is left untouched so
            that reading the policy never corrupts the running accumulator.
            A uniform distribution is returned when nothing was accumulated.
            """
            normalizingSum = np.sum(self.strategySum)
            if normalizingSum > 0:
                return self.strategySum / normalizingSum
            numActions = len(self.strategySum)
            return np.full(numActions, 1.0 / numActions)

    def __init__(self, sides):
        """Allocate every response and claim node for a ``sides``-sided die.

        Args:
            sides: Number of die sides; claims range over ``1..sides``.
        """
        self.sides = sides
        # Explicit object dtype: the cells hold Node instances or None.
        self.responseNodes = np.empty((sides, sides + 1), dtype=object)
        for myClaim in range(sides):
            for oppClaim in range(myClaim + 1, sides + 1):
                # The highest claim cannot be accepted, so DOUBT is the
                # only action there.
                numActions = 1 if oppClaim == sides else 2
                self.responseNodes[myClaim, oppClaim] = self.Node(numActions)
        self.claimNodes = np.empty((sides, sides + 1), dtype=object)
        for oppClaim in range(sides):
            for roll in range(1, sides + 1):
                # One action per claim strictly above oppClaim.
                self.claimNodes[oppClaim, roll] = self.Node(sides - oppClaim)

    def train(self, iterations):
        """Run ``iterations`` training passes and print the learned policy.

        Strategy sums are cleared once, at iteration ``iterations // 2``, so
        the reported averages only reflect the second half of training.

        Args:
            iterations: Number of sampled forward/backward passes to run.
        """
        for it in range(iterations):
            # rolls[c] is the die value seen after accepting claim c
            # (rolls[0] is the initial roll); values lie in 1..sides.
            rolls = np.random.randint(1, self.sides + 1, size=self.sides)
            startNode = self.claimNodes[0, rolls[0]]
            startNode.pPlayer = 1
            startNode.pOpponent = 1

            self._accumulateReachProbabilities(rolls)
            self._backpropagateUtilities(rolls)

            if it == iterations // 2:
                self._resetStrategySums()
        self._printStrategies()

    def _accumulateReachProbabilities(self, rolls):
        """Forward pass: compute strategies and push reach probabilities."""
        for oppClaim in range(self.sides + 1):
            if oppClaim > 0:
                for myClaim in range(oppClaim):
                    node = self.responseNodes[myClaim, oppClaim]
                    actionProb = node.getStrategy()
                    if oppClaim < self.sides:
                        nextNode = self.claimNodes[oppClaim, rolls[oppClaim]]
                        # After ACCEPT the same player rolls and claims next.
                        nextNode.pPlayer += actionProb[self.ACCEPT] * node.pPlayer
                        nextNode.pOpponent += node.pOpponent
            if oppClaim < self.sides:
                node = self.claimNodes[oppClaim, rolls[oppClaim]]
                actionProb = node.getStrategy()
                for myClaim in range(oppClaim + 1, self.sides + 1):
                    nextClaimProb = actionProb[myClaim - oppClaim - 1]
                    if nextClaimProb > 0:
                        nextNode = self.responseNodes[oppClaim, myClaim]
                        # The roles swap: the responder is the other player.
                        nextNode.pPlayer += node.pOpponent
                        nextNode.pOpponent += nextClaimProb * node.pPlayer

    def _backpropagateUtilities(self, rolls):
        """Backward pass: propagate utilities and accumulate regrets."""
        for oppClaim in reversed(range(self.sides + 1)):
            if oppClaim < self.sides:
                node = self.claimNodes[oppClaim, rolls[oppClaim]]
                # A claim hands the move to the opponent, hence the negation.
                claimUtils = np.array([
                    -self.responseNodes[oppClaim, myClaim].u
                    for myClaim in range(oppClaim + 1, self.sides + 1)
                ])
                self._updateRegrets(node, claimUtils)
            if oppClaim > 0:
                for myClaim in range(oppClaim):
                    node = self.responseNodes[myClaim, oppClaim]
                    # Doubting wins when the claim exceeds the roll made
                    # after claim myClaim was accepted.
                    doubtUtil = 1.0 if oppClaim > rolls[myClaim] else -1.0
                    responseUtils = [doubtUtil]
                    if oppClaim < self.sides:
                        nextNode = self.claimNodes[oppClaim, rolls[oppClaim]]
                        # Set directly (never accumulated) so no value from a
                        # previously processed node can leak into this regret.
                        responseUtils.append(nextNode.u)
                    self._updateRegrets(node, np.array(responseUtils))

    @staticmethod
    def _updateRegrets(node, actionUtils):
        """Set ``node.u``, add opponent-weighted regrets, clear reach probs."""
        node.u = float(np.dot(node.strategy, actionUtils))
        node.regretSum += node.pOpponent * (actionUtils - node.u)
        node.pPlayer = node.pOpponent = 0

    def _resetStrategySums(self):
        """Zero the strategy accumulator of every allocated node."""
        for table in (self.responseNodes, self.claimNodes):
            for node in table.flat:
                if node is not None:
                    node.strategySum.fill(0)

    def _printStrategies(self):
        """Print the average strategy of every claim and response node."""
        for initialRoll in range(1, self.sides + 1):
            policy = self.claimNodes[0, initialRoll].getAverageStrategy()
            print("Initial claim policy with roll %d: %s"
                  % (initialRoll, np.round(policy, 2)))
        print("\nOld Claim\tNew Claim\tAction Probabilities")
        for myClaim in range(self.sides):
            for oppClaim in range(myClaim + 1, self.sides + 1):
                policy = self.responseNodes[myClaim, oppClaim].getAverageStrategy()
                print("\t%d\t%d\t%s" % (myClaim, oppClaim, policy))
        print("\nOld Claim\tRoll\tAction Probabilities")
        for oppClaim in range(self.sides):
            for roll in range(1, self.sides + 1):
                policy = self.claimNodes[oppClaim, roll].getAverageStrategy()
                print("%d\t%d\t%s" % (oppClaim, roll, policy))
# Script driver: build a trainer for a six-sided die, run 1000 training
# iterations, and let train() print the resulting policies.
trainer = LiarDieTrainer(sides=6)
trainer.train(iterations=1000)