;;; Code for creating random unstructured and structured MDPs (utilities at end) (in-package cl-user) (locally (declare (speed 3) (safety 0)) ;;; Random, finite, unstructured MDPs ;;;;;;;;;;;;;;;;;;;;;;; #| Unstructured random MDPs N = number of states M = number of actions B = branching factor P = array giving, for each state and action, a list of next states and probabilities R = array giving expected reward for each state and action Example: Evaluate the random 2-action policy on a set of 50 random MDPs. Random policy: (defun policy (state) '(0.5 0.5)) Make 50 MDPs, each of 100 states and a branching factor of 3: (setq MDPs (loop for i below 50 collect (make-random-MDP 100 2 3))) Evaluate them: (loop for MDP in MDPs do (print (evaluate-policy 'policy MDP .9 0))) |# (defclass MDP () (N M B P R)) (defun make-random-MDP (num-states num-actions branching-factor) (let ((MDP (make-instance 'MDP))) (with-slots (N M B P R) MDP (setq N num-states) (setq M num-actions) (setq B branching-factor) (setq P (make-array (list N M) :initial-element nil)) (setq R (make-array (list N M))) (loop for s below N do (loop for a below M do (setf (aref R s a) (random-normal)) (loop for prob in (random-simplex branching-factor) for sprime in (random-k-of-n B N) do (setf (aref P s a) (add-prob prob sprime (aref P s a)))))) MDP))) (defmethod next-state ((MDP MDP) s a &optional (random-state *random-state*)) (with-slots (P) MDP (let ((R (random 1.0 random-state))) (loop for (s-prime . prob) in (aref P s a) sum prob into cumulative-prob until (>= cumulative-prob R) finally (return s-prime))))) (defmethod expected-reward ((MDP MDP) s a) (with-slots (R) MDP (aref R s a))) (defun evaluate-policy (policy MDP gamma start-state &optional (threshold .0000001)) "DP policy evaluation of the policy (mapping from states to action probs)" (with-slots (N M B P R) MDP (let ((V (make-array N :initial-element 0))) (loop for delta = 0 do (loop for s below N for old-V = (aref V s) do (setf (aref V s) (loop for a below M for prob in (funcall policy s) sum (* prob (+ (aref R s a) (loop for (sp . pr) in (aref P s a) sum (* pr gamma (aref V sp))))))) (if (< delta (abs (- old-V (aref V s)))) (setq delta (abs (- old-V (aref V s)))))) until (< delta threshold)) (aref V start-state)))) (defun add-prob (increment outcome prob-list) (if (loop for s.prob in prob-list for (s . prob) in prob-list thereis (if (= s outcome) (setf (cdr s.prob) (+ prob increment)))) prob-list (cons (cons outcome increment) prob-list))) ;;; Random structured MDPs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; #| Structured MDPs are MDPs with state VARIABLES, each of which can take on a (small) number of values in {0,1,2,...,k}. Actions correspond to rules, one per variable, for describing the probability distribution over values of that variable, plus another set of rules for additive components of the expected reward. A structed MDP is a list of actions and an array of numbers of values for the variables. Each action is a list of transition rules (one per variable) and a list of reward rules. A transition rule for a variable, a list of antecedents, and an array of simplexes (a table of probabilities for the variable taking on each of its values, indexed by the antecedents' values). A reward rule is a list of antecedents and a table of reward components. Example: Evaluate the random 2-action policy on a set of 50 random structured MDPs. Random policy: (defun sample-policy (state) (random 2)) Make 50 MDPs, each with 10 binary state vars, where both actions have 4 transition rules and 1 reward rule, and all rules depend on 3 antecedent variables (setq MDPs (loop for i below 50 collect (make-random-structured-MDP (loop repeat 10 collect 2) 2 4 3 1 3))) Evaluate them: (loop for MDP in MDPs do (print (MC-policy-evaluation 'sample-policy MDP .9 100 .01))) |# (defclass Structured-MDP () (actions num-values-of-each-var hash-table)) (defclass action () (transition-rules reward-rules)) (defclass transition-rule () (consequent antecedents simplexes)) (defclass reward-rule () (antecedents reward-components)) (defun make-structured-MDP (num-values-of-each-var &optional actions) (let ((MDP (make-instance 'Structured-MDP))) (setf (slot-value MDP 'actions) actions) (setf (slot-value MDP 'num-values-of-each-var) num-values-of-each-var) (setf (slot-value MDP 'hash-table) (make-hash-table :test #'equal)) MDP)) (defun num-values (Structured-MDP variable-num) (with-slots (num-values-of-each-var) Structured-MDP (nth variable-num num-values-of-each-var))) (defun num-actions (Structured-MDP) (with-slots (actions) Structured-MDP (length actions))) (defun num-variables (structured-MDP) (with-slots (num-values-of-each-var) Structured-MDP (length num-values-of-each-var))) (defun add-action (Structured-MDP &optional transition-rules reward-rules) "Adds a new action to the structured MDP, returns the action number" (let ((action (make-instance 'action))) (setf (slot-value action 'transition-rules) transition-rules) (setf (slot-value action 'reward-rules) reward-rules) (with-slots (actions) structured-MDP (setf actions (append actions (list action))) (- (length actions) 1)))) (defun add-transition-rule (Structured-MDP action-num consequent antecedents simplexes) "Adds a new transition-rule to the indicated action of the structured MDP" (let ((transition-rule (make-instance 'transition-rule))) (setf (slot-value transition-rule 'consequent) consequent) (setf (slot-value transition-rule 'antecedents) antecedents) (setf (slot-value transition-rule 'simplexes) simplexes) (with-slots (transition-rules) (nth action-num (slot-value structured-MDP 'actions)) (setf transition-rules (append transition-rules (list transition-rule)))))) (defun add-reward-rule (Structured-MDP action-num antecedents reward-components) "Adds a new reward-rule to the indicated action of the structured MDP" (let ((reward-rule (make-instance 'reward-rule))) (setf (slot-value reward-rule 'antecedents) antecedents) (setf (slot-value reward-rule 'reward-components) reward-components) (with-slots (reward-rules) (nth action-num (slot-value structured-MDP 'actions)) (setf reward-rules (append reward-rules (list reward-rule)))))) (defun add-random-reward-rule (Structured-MDP action-num K) (let* ((antecedents (random-k-of-n K (num-variables Structured-MDP))) (num-values (loop for a in antecedents collect (num-values Structured-MDP a)))) (add-reward-rule Structured-MDP action-num antecedents (make-array num-values :initial-contents (structure-of-random-numbers num-values))))) (defun structure-of-random-numbers (structure) (if (null structure) (random-normal) (loop repeat (first structure) collect (structure-of-random-numbers (rest structure))))) (defun add-random-transition-rule (Structured-MDP action-num consequent K simplex-generator) (let* ((antecedents (random-k-of-n K (num-variables Structured-MDP))) (num-values (loop for a in antecedents collect (num-values structured-MDP a))) (CPT (make-array num-values))) (funcall-on-indices nil num-values (lambda (antecedent-values) (setf (apply #'aref CPT antecedent-values) (simplex-to-fastsimplex (funcall simplex-generator antecedent-values action-num (num-values structured-MDP consequent)))))) (add-transition-rule Structured-MDP action-num consequent antecedents CPT))) (defun funcall-on-indices (prefix remaining function) (if (null remaining) (funcall function prefix) (loop for i below (first remaining) do (funcall-on-indices (append prefix (list i)) (rest remaining) function)))) (defun make-random-Structured-MDP (num-values-of-each-var num-actions num-transition-rules num-transition-antecedents num-reward-rules num-reward-antecedents &optional (simplex-generator #'random-simplex-generator)) (let ((MDP (make-structured-MDP num-values-of-each-var))) (loop repeat num-actions for action-num = (add-action MDP) do (loop repeat num-transition-rules for consequent in (random-k-of-n num-transition-rules (num-variables MDP)) do (add-random-transition-rule MDP action-num consequent num-transition-antecedents simplex-generator)) do (loop repeat num-reward-rules do (add-random-reward-rule MDP action-num num-reward-antecedents))) MDP)) (defun random-simplex-generator (antecedent-values action num-values) (declare (ignore antecedent-values) (ignore action)) (random-simplex num-values)) #| (defmethod next-state ((MDP Structured-MDP) state action &optional (random-state *random-state*)) "The new list of state variable values as a consequence of taking action" (let ((new-state (copy-list state))) (loop for rule in (slot-value (nth action (slot-value MDP 'actions)) 'transition-rules) do (with-slots (consequent antecedents simplexes) rule (setf (nth consequent new-state) (random-from-simplex (apply #'aref simplexes (loop for a in antecedents collect (nth a state))) random-state)))) new-state)) |# (defmethod next-state ((MDP Structured-MDP) state action &optional (random-state *random-state*)) "The new list of state variable values as a consequence of taking action" (with-slots (hash-table) MDP (let ((simplexes (gethash (cons action state) hash-table))) (unless simplexes (setq simplexes (collect-simplexes MDP state action)) (setf (gethash (cons action state) hash-table) simplexes)) (loop for simplex in simplexes collect (random-from-fastsimplex simplex random-state))))) (defun collect-simplexes (MDP state action) (with-slots (transition-rules) (nth action (slot-value MDP 'actions)) (loop for var below (num-variables MDP) collect (loop for rule in transition-rules do (with-slots (consequent antecedents simplexes) rule (if (= var consequent) (return (apply #'aref simplexes (loop for a in antecedents collect (nth a state)))))) finally (return (deterministic-simplex (nth var state))))))) (defmethod expected-reward ((MDP Structured-MDP) state action) (loop for rule in (slot-value (nth action (slot-value MDP 'actions)) 'reward-rules) sum (with-slots (antecedents reward-components) rule (apply #'aref reward-components (loop for a in antecedents collect (nth a state)))))) (defun MC-policy-evaluation (sample-policy MDP gamma &optional (num-samples 1000) (threshold .001) (start-state (loop repeat (num-variables MDP) collect 0))) "Monte-Carlo policy evaluation of the policy (mapping from states to action probs)" (stats (loop with state = start-state repeat num-samples collect (loop for multiplier = 1 then (* multiplier gamma) until (< multiplier threshold) for action = (funcall sample-policy state) sum (* multiplier (expected-reward MDP state action)) do (setq state (next-state MDP state action)))))) ;;; utility routines ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defmacro square (x) `(if (> (abs ,x) 1e10) 1e20 (* ,x ,x))) (defun random-normal (&optional (random-state *random-state*)) (do ((u 0.0) (v 0.0)) ((progn (setq u (random 1.0 random-state) ; U is bounded (0 1) v (* 2.0 (sqrt 2.0) (exp -0.5) ; V is bounded (-MAX MAX) (- (random 1.0 random-state) 0.5))) (<= (* v v) (* -4.0 u u (log u)))) ; < should be <= (/ v u)) (declare (double-float u v)))) (defun mean (l) (float (/ (loop for i in l sum i) (length l)))) (defun mse (target values) (mean (loop for v in values collect (square (- v target))))) (defun rmse (target values) ;root mean square error (sqrt (mse target values))) (defun stdev (l) (rmse (mean l) l)) (defun stats (list) (list (mean list) (/ (stdev list) (sqrt (length list))))) (defun random-k-of-n (k n &optional (random-state *random-state*)) (loop for i = (random n random-state) unless (member i result) collect i into result until (= k (length result)) finally (return result))) (defun random-simplex (n &optional (random-state *random-state*)) "returns n positive numbers that sum to 1.0 -- a random partition of unity" (loop for last-r = 0 then r for r in (sort (cons 1.0 (loop repeat (- n 1) collect (random 1.0 random-state))) #'<) collect (- r last-r))) (defun random-from-simplex (simplex &optional (random-state *random-state*)) "random sample from {0,...,n-1} where simplex is list of the n probabilities (must sum to 1)" (let ((R (random 1.0 random-state))) (loop for i below (- (length simplex) 1) for prob in simplex for cumulative-prob = prob then (+ cumulative-prob prob) ; little optimizations until (>= cumulative-prob R) finally (return i)))) (defconstant fast-simplex-max 10000) (defun random-from-fastsimplex (integer-csimplex &optional (random-state *random-state*)) "like random-simplex but from an integer cumulative simplex, a CDF from 0 to fast-simplex-max instead of 0 to 1.0" (let ((R (random fast-simplex-max random-state))) (loop for i below (- (length integer-csimplex) 1) for cumulative-prob in integer-csimplex until (>= cumulative-prob R) finally (return i)))) (defun deterministic-simplex (k) "returns the simplex that puts all the weight on the kth choice" (loop with list = (list 1) repeat k do (push 0 list) finally (return list))) (defun deterministic-fastsimplex (k) "returns the simplex that puts all the weight on the kth choice" (loop with list = (list fast-simplex-max) repeat k do (push 0 list) finally (return list))) (defun simplex-to-fastsimplex (simplex) (append (loop for p in (butlast simplex) sum p into sump collect (round (* sump fast-simplex-max))) (list fast-simplex-max))) )