fork download
  1. S0 = "S0"
  2. S1 = "S1"
  3. S2 = "S2"
  4. BLUE = "blue"
  5. RED = "red"
  6. gamma = 0.5 # TODO MODIFY GAMMA HERE
  7.  
  8. # P(s'|s,a)
  9. P_destination_start_action = \
  10. {(S0,S0, BLUE):0.5,(S0,S0,RED):0.9, (S0,S1,BLUE):0.8,(S0,S1,RED):0, (S0,S2, BLUE):0,(S0,S2,RED):0,
  11. (S1,S0, BLUE):0.5,(S1,S0,RED):0, (S1,S1,BLUE):0.2,(S1,S1,RED):0.6, (S1,S2, BLUE):0,(S1,S2,RED):0,
  12. (S2,S0, BLUE):0, (S2,S0,RED):0.1, (S2,S1,BLUE):0 ,(S2,S1,RED):0.4, (S2,S2, BLUE):1,(S2,S2,RED):1}
  13.  
  14. class MDP:
  15. def __init__(self):
  16. self.states = [S0, S1, S2]
  17. self.actions = [BLUE, RED]
  18.  
  19.  
  20. self.P_dest_start_action = P_destination_start_action
  21. self.rewards = {S0: -2, S1: -5, S2: 0}
  22.  
  23. def POLICY_EVALUATION(policy_vec, utility_vec, mdp):
  24. new_utility_vector = {}
  25. for s in mdp.states:
  26. to_sum = [(mdp.P_dest_start_action[(s_tag, s, policy_vec[s])] * utility_vec[s_tag])
  27. for s_tag in mdp.states]
  28. new_utility_vector[s] = mdp.rewards[s] + gamma * sum(to_sum)
  29. return new_utility_vector
  30.  
  31. def POLICY_ITERATION(mdp):
  32. utility_vector = {state: 0 for state in mdp.states}
  33. policy_vector = {S0: BLUE, S1: RED, S2: RED}
  34. unchanged = False
  35.  
  36. while not unchanged:
  37. utility_vector = POLICY_EVALUATION(policy_vector, utility_vector, mdp)
  38. unchanged = True
  39. for s in mdp.states:
  40. BLUE_sum = sum([(mdp.P_dest_start_action[(s_tag, s, BLUE)] * utility_vector[s_tag])
  41. for s_tag in mdp.states])
  42. RED_sum = sum([(mdp.P_dest_start_action[(s_tag, s, RED)] * utility_vector[s_tag])
  43. for s_tag in mdp.states])
  44. if policy_vector[s] == RED and BLUE_sum > RED_sum:
  45. policy_vector[s] = BLUE
  46. unchanged = False
  47.  
  48. elif policy_vector[s] == BLUE and RED_sum > BLUE_sum:
  49. policy_vector[s] = RED
  50. unchanged = False
  51.  
  52. return policy_vector
  53.  
  54. if __name__ == "__main__":
  55. Q2_mdp = MDP()
  56. new_policy_vec = POLICY_ITERATION(Q2_mdp)
  57. print("===========================END===============================")
  58. print("S_O policy =", new_policy_vec[S0], " ,S_1 Policy =", new_policy_vec[S1])
  59.  
  60.  
Success #stdin #stdout 0.04s 9512KB
stdin
Standard input is empty
stdout
===========================END===============================
S_O policy = red  ,S_1 Policy = blue