<?php

    //Q-learning algorithm in PHP
    //Source: https://k...content-available-to-author-only...s.com/2010/09/24/q-learning/

    /**
     * Tabular Q-learning on a small deterministic grid world.
     *
     * States A..F are laid out as
     *
     *     _______
     *     |A|B|C|
     *     |_____|
     *     |D|E|F|
     *     |_____|
     *
     * An "action" is the index of the neighbouring state to move to, and every
     * move succeeds (transition probability 1). C is the goal state: entering it
     * from B or from F yields a reward of 100, every other move yields 0.
     *
     * Update rule applied on every step:
     *   Q(s,a) = Q(s,a) + alpha * (R(s,a) + gamma * max_a' Q(s',a') - Q(s,a))
     */
    class Qlearning{

        /** @var float Learning rate (alpha) of the update rule. */
        public $alpha;
        /** @var float Discount factor (gamma) applied to the next state's best value. */
        public $gamma;

        /** @var int Index of state A. */
        public $stateA;
        /** @var int Index of state B. */
        public $stateB;
        /** @var int Index of state C (the goal state). */
        public $stateC;
        /** @var int Index of state D. */
        public $stateD;
        /** @var int Index of state E. */
        public $stateE;
        /** @var int Index of state F. */
        public $stateF;

        /** @var int Number of states. */
        public $statesCount;
        /** @var int[] All state indices. */
        public $states;

        /** @var array<int, array<int, int|float>> Reward table, indexed [state][action]. */
        public $R;
        /** @var array<int, array<int, float>> Learned value table, indexed [state][action]. */
        public $Q;

        /** @var int[] States reachable from A. */
        public $actionsFromA;
        /** @var int[] States reachable from B. */
        public $actionsFromB;
        /** @var int[] States reachable from C (only C itself). */
        public $actionsFromC;
        /** @var int[] States reachable from D. */
        public $actionsFromD;
        /** @var int[] States reachable from E. */
        public $actionsFromE;
        /** @var int[] States reachable from F. */
        public $actionsFromF;
        /** @var array<int, int[]> Reachable states, indexed by source state. */
        public $actions;

        /** @var string[] Display name of each state, indexed by state. */
        public $stateNames;

        /**
         * Sets the hyper-parameters, describes the grid, zeroes both tables and
         * loads the rewards through init().
         */
        public function __construct(){
            $this->alpha = 0.1;
            $this->gamma = 0.9;

            $this->stateA = 0;
            $this->stateB = 1;
            $this->stateC = 2;
            $this->stateD = 3;
            $this->stateE = 4;
            $this->stateF = 5;

            $this->states = [
                $this->stateA,
                $this->stateB,
                $this->stateC,
                $this->stateD,
                $this->stateE,
                $this->stateF,
            ];
            // Derived from the list so the count cannot drift from the states.
            $this->statesCount = count($this->states);

            // e.g. from A we can go to B or D; from C we can only stay in C.
            $this->actionsFromA = [ $this->stateB, $this->stateD ];
            $this->actionsFromB = [ $this->stateA, $this->stateC, $this->stateE ];
            $this->actionsFromC = [ $this->stateC ];
            $this->actionsFromD = [ $this->stateA, $this->stateE ];
            $this->actionsFromE = [ $this->stateB, $this->stateD, $this->stateF ];
            $this->actionsFromF = [ $this->stateC, $this->stateE ];

            $this->actions = [
                $this->actionsFromA,
                $this->actionsFromB,
                $this->actionsFromC,
                $this->actionsFromD,
                $this->actionsFromE,
                $this->actionsFromF,
            ];

            // Order must follow the state indices above (E = 4, F = 5).
            $this->stateNames = [
                "A", "B", "C", "D", "E", "F"
            ];

            // Start from zero tables covering every legal (state, action) pair.
            $this->R = [];
            $this->Q = [];
            foreach ($this->actions as $from => $targets) {
                foreach ($targets as $to) {
                    $this->R[$from][$to] = 0;
                    $this->Q[$from][$to] = 0.0;
                }
            }

            $this->init();
        }

        /**
         * Loads the non-zero rewards: entering the goal C from B or from F pays 100.
         */
        public function init(){
            $this->R[$this->stateB][$this->stateC] = 100;
            $this->R[$this->stateF][$this->stateC] = 100;
        }

        /**
         * Entry point: trains the Q table with the default number of episodes.
         */
        public function main(){
            $this->run();
        }

        /**
         * Trains the Q table.
         *
         * Each episode starts in a uniformly random state and follows uniformly
         * random legal actions until the goal state C is reached, applying the
         * Q-learning update after every step. An episode that starts in C ends
         * immediately without any update.
         *
         * @param int $episodes Number of training episodes (defaults to 1000).
         */
        public function run($episodes = 1000){
            for ($episode = 0; $episode < $episodes; $episode++) {
                // random_int() bounds are inclusive, hence the "- 1".
                $state = random_int(0, $this->statesCount - 1);

                while ($state !== $this->stateC) {
                    // Selection strategy is purely random in this example.
                    $choices = $this->actions[$state];
                    $action = $choices[random_int(0, count($choices) - 1)];

                    // Deterministic world: the action is the state we land in.
                    $nextState = $action;

                    $q = $this->Q($state, $action);
                    $maxQ = $this->maxQ($nextState);
                    $r = $this->R($state, $action);

                    $value = $q + $this->alpha * ($r + $this->gamma * $maxQ - $q);
                    $this->setQ($state, $action, $value);

                    $state = $nextState;
                }
            }
        }

        /**
         * Returns the largest Q value over all actions available in a state.
         *
         * @param int $s State to evaluate.
         * @return float|int Best known value, or 0.0 if the state has no actions.
         */
        public function maxQ($s){
            $values = [];
            foreach ($this->actions[$s] ?? [] as $a) {
                $values[] = $this->Q($s, $a);
            }
            // max() rejects an empty array, so guard the no-action case.
            return $values === [] ? 0.0 : max($values);
        }

        /**
         * Stores a Q value.
         *
         * @param int   $s     State.
         * @param int   $a     Action (target state).
         * @param float $value New value for Q(s, a).
         */
        public function setQ($s, $a, $value){
            $this->Q[$s][$a] = $value;
        }

        /**
         * Reads a Q value without modifying the table.
         *
         * @param int $s State.
         * @param int $a Action (target state).
         * @return float|int Q(s, a), or 0.0 if the pair has no entry.
         */
        public function Q($s, $a){
            return $this->Q[$s][$a] ?? 0.0;
        }

        /**
         * Reads a reward without modifying the table.
         *
         * @param int $s State.
         * @param int $a Action (target state).
         * @return int|float R(s, a), or 0 if the pair has no entry.
         */
        public function R($s, $a){
            return $this->R[$s][$a] ?? 0;
        }
    }
    
    
    // Build the learner and invoke its main() entry point.
    $q_learning = new Qlearning();
    $q_learning->main();