Copied!







import sys
if "google.colab" in sys.modules:
    %pip install "inferactively-pymdp[nb]" -q

import sys
if "google.colab" in sys.modules:
    %pip install "inferactively-pymdp[nb]" -q





Copied!







import sys
if "google.colab" in sys.modules:
    %pip install "inferactively-pymdp[nb]" -q

import sys
if "google.colab" in sys.modules:
    %pip install "inferactively-pymdp[nb]" -q





Copied!







%load_ext autoreload
%autoreload 2

import jax.numpy as jnp
import jax.random as jr

import matplotlib.pyplot as plt
import numpy as np
import mediapy

from pymdp.envs import SimplifiedTMaze, rollout
from pymdp.agent import Agent
from pymdp.planning.si import si_policy_search

%load_ext autoreload
%autoreload 2

import jax.numpy as jnp
import jax.random as jr

import matplotlib.pyplot as plt
import numpy as np
import mediapy

from pymdp.envs import SimplifiedTMaze, rollout
from pymdp.agent import Agent
from pymdp.planning.si import si_policy_search





Copied!







%load_ext autoreload
%autoreload 2

import jax.numpy as jnp
import jax.random as jr

import matplotlib.pyplot as plt
import numpy as np
import mediapy

from pymdp.envs import SimplifiedTMaze, rollout
from pymdp.agent import Agent
from pymdp.planning.si import si_policy_search

%load_ext autoreload
%autoreload 2

import jax.numpy as jnp
import jax.random as jr

import matplotlib.pyplot as plt
import numpy as np
import mediapy

from pymdp.envs import SimplifiedTMaze, rollout
from pymdp.agent import Agent
from pymdp.planning.si import si_policy_search





Copied!







# Parameters for the T-maze environment.
# reward_condition: 0 = reward in left arm, 1 = reward in right arm, None = random allocation.
reward_condition = 0
# cue_validity: 0.95 means the cues are 95% valid.
cue_validity = 0.95

# reward_probability: 1.0 means a 100% chance of reward in the correct arm.
reward_probability = 1.0
# dependent_outcomes:
#   True  -> punishment occurs as a function of reward probability
#            (e.g. a reward probability of 0.8 implies 20% punishment).
#   False -> punishment occurs with a set probability (e.g. the remaining 20% is "no outcome"),
#            and punishment only occurs in the other (non-rewarding) arm, depending on the
#            punishment_probability parameter.
dependent_outcomes = True

# Parameters for the T-maze environment.
# reward_condition: 0 = reward in left arm, 1 = reward in right arm, None = random allocation.
reward_condition = 0
# cue_validity: 0.95 means the cues are 95% valid.
cue_validity = 0.95

# reward_probability: 1.0 means a 100% chance of reward in the correct arm.
reward_probability = 1.0
# dependent_outcomes:
#   True  -> punishment occurs as a function of reward probability
#            (e.g. a reward probability of 0.8 implies 20% punishment).
#   False -> punishment occurs with a set probability (e.g. the remaining 20% is "no outcome"),
#            and punishment only occurs in the other (non-rewarding) arm, depending on the
#            punishment_probability parameter.
dependent_outcomes = True





Copied!







# Parameters for the T-maze environment.
# reward_condition: 0 = reward in left arm, 1 = reward in right arm, None = random allocation.
reward_condition = 0
# cue_validity: 0.95 means the cues are 95% valid.
cue_validity = 0.95

# reward_probability: 1.0 means a 100% chance of reward in the correct arm.
reward_probability = 1.0
# dependent_outcomes:
#   True  -> punishment occurs as a function of reward probability
#            (e.g. a reward probability of 0.8 implies 20% punishment).
#   False -> punishment occurs with a set probability (e.g. the remaining 20% is "no outcome"),
#            and punishment only occurs in the other (non-rewarding) arm, depending on the
#            punishment_probability parameter.
dependent_outcomes = True

# Parameters for the T-maze environment.
# reward_condition: 0 = reward in left arm, 1 = reward in right arm, None = random allocation.
reward_condition = 0
# cue_validity: 0.95 means the cues are 95% valid.
cue_validity = 0.95

# reward_probability: 1.0 means a 100% chance of reward in the correct arm.
reward_probability = 1.0
# dependent_outcomes:
#   True  -> punishment occurs as a function of reward probability
#            (e.g. a reward probability of 0.8 implies 20% punishment).
#   False -> punishment occurs with a set probability (e.g. the remaining 20% is "no outcome"),
#            and punishment only occurs in the other (non-rewarding) arm, depending on the
#            punishment_probability parameter.
dependent_outcomes = True





Copied!







# Build the SimplifiedTMaze environment from the four parameters set earlier in the notebook.
# See si_tmaze.py in pymdp/envs for the implementation details.
env = SimplifiedTMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,  
    reward_probability=reward_probability,     
    dependent_outcomes=dependent_outcomes,
)

# Build the SimplifiedTMaze environment from the four parameters set earlier in the notebook.
# See si_tmaze.py in pymdp/envs for the implementation details.
env = SimplifiedTMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,  
    reward_probability=reward_probability,     
    dependent_outcomes=dependent_outcomes,
)





Copied!







# Build the SimplifiedTMaze environment from the four parameters set earlier in the notebook.
# See si_tmaze.py in pymdp/envs for the implementation details.
env = SimplifiedTMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,  
    reward_probability=reward_probability,     
    dependent_outcomes=dependent_outcomes,
)

# Build the SimplifiedTMaze environment from the four parameters set earlier in the notebook.
# See si_tmaze.py in pymdp/envs for the implementation details.
env = SimplifiedTMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,  
    reward_probability=reward_probability,     
    dependent_outcomes=dependent_outcomes,
)





Copied!







# Preference vectors C for the Agent: one all-zero float32 vector per entry of env.A,
# with length equal to the first axis of that A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# Preferences are placed on the outcome vector C[1] only:
# index 1 (reward) is preferred at +2.0, index 2 (punishment) is avoided at -6.0.
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# slight cost of observing a cue
# C[0] = C[0].at[3].set(-1.0).at[4].set(-1.0)

# Preference vectors C for the Agent: one all-zero float32 vector per entry of env.A,
# with length equal to the first axis of that A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# Preferences are placed on the outcome vector C[1] only:
# index 1 (reward) is preferred at +2.0, index 2 (punishment) is avoided at -6.0.
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# slight cost of observing a cue
# C[0] = C[0].at[3].set(-1.0).at[4].set(-1.0)





Copied!







# Preference vectors C for the Agent: one all-zero float32 vector per entry of env.A,
# with length equal to the first axis of that A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# Preferences are placed on the outcome vector C[1] only:
# index 1 (reward) is preferred at +2.0, index 2 (punishment) is avoided at -6.0.
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# slight cost of observing a cue
# C[0] = C[0].at[3].set(-1.0).at[4].set(-1.0)

# Preference vectors C for the Agent: one all-zero float32 vector per entry of env.A,
# with length equal to the first axis of that A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# Preferences are placed on the outcome vector C[1] only:
# index 1 (reward) is preferred at +2.0, index 2 (punishment) is avoided at -6.0.
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# slight cost of observing a cue
# C[0] = C[0].at[3].set(-1.0).at[4].set(-1.0)





Copied!







# Flat (uniform) D vectors for the agent ([location], [reward]): one per entry of env.B,
# each of length n = first axis of that B tensor, with every element equal to 1/n.
D = [
    jnp.ones(num_states, dtype=jnp.float32) / num_states
    for num_states in (transition.shape[0] for transition in env.B)
]

# Flat (uniform) D vectors for the agent ([location], [reward]): one per entry of env.B,
# each of length n = first axis of that B tensor, with every element equal to 1/n.
D = [
    jnp.ones(num_states, dtype=jnp.float32) / num_states
    for num_states in (transition.shape[0] for transition in env.B)
]





Copied!







# Flat (uniform) D vectors for the agent ([location], [reward]): one per entry of env.B,
# each of length n = first axis of that B tensor, with every element equal to 1/n.
D = [
    jnp.ones(num_states, dtype=jnp.float32) / num_states
    for num_states in (transition.shape[0] for transition in env.B)
]

# Flat (uniform) D vectors for the agent ([location], [reward]): one per entry of env.B,
# each of length n = first axis of that B tensor, with every element equal to 1/n.
D = [
    jnp.ones(num_states, dtype=jnp.float32) / num_states
    for num_states in (transition.shape[0] for transition in env.B)
]





Copied!







# The two agents are created with different policy lengths, even though both end up
# planning with a horizon of 2:
#   - the vanilla agent is given policy_len=2 directly;
#   - the sophisticated-inference agent requires policy_len=1, because its horizon of 2 is
#     specified on the planning algorithm (`si_policy_search`) that is handed to `rollout`.
#
# action_selection="deterministic": the action is selected from the policy probability
#   distribution (q_pi) by arg-max.
# sampling_mode="full": the whole action sequence of each policy is evaluated and its first
#   action is executed (as opposed to "marginal", where the agent evaluates each action type).
policy_len = 2
agent_vanilla = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# Same configuration as agent_vanilla except for policy_len=1 (see the note above policy_len).
agent_si = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# The two agents are created with different policy lengths, even though both end up
# planning with a horizon of 2:
#   - the vanilla agent is given policy_len=2 directly;
#   - the sophisticated-inference agent requires policy_len=1, because its horizon of 2 is
#     specified on the planning algorithm (`si_policy_search`) that is handed to `rollout`.
#
# action_selection="deterministic": the action is selected from the policy probability
#   distribution (q_pi) by arg-max.
# sampling_mode="full": the whole action sequence of each policy is evaluated and its first
#   action is executed (as opposed to "marginal", where the agent evaluates each action type).
policy_len = 2
agent_vanilla = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# Same configuration as agent_vanilla except for policy_len=1 (see the note above policy_len).
agent_si = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/282069077.py:8: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_vanilla = Agent(
/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/282069077.py:20: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_si = Agent(





Copied!







# The two agents are created with different policy lengths, even though both end up
# planning with a horizon of 2:
#   - the vanilla agent is given policy_len=2 directly;
#   - the sophisticated-inference agent requires policy_len=1, because its horizon of 2 is
#     specified on the planning algorithm (`si_policy_search`) that is handed to `rollout`.
#
# action_selection="deterministic": the action is selected from the policy probability
#   distribution (q_pi) by arg-max.
# sampling_mode="full": the whole action sequence of each policy is evaluated and its first
#   action is executed (as opposed to "marginal", where the agent evaluates each action type).
policy_len = 2
agent_vanilla = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# Same configuration as agent_vanilla except for policy_len=1 (see the note above policy_len).
agent_si = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# The two agents are created with different policy lengths, even though both end up
# planning with a horizon of 2:
#   - the vanilla agent is given policy_len=2 directly;
#   - the sophisticated-inference agent requires policy_len=1, because its horizon of 2 is
#     specified on the planning algorithm (`si_policy_search`) that is handed to `rollout`.
#
# action_selection="deterministic": the action is selected from the policy probability
#   distribution (q_pi) by arg-max.
# sampling_mode="full": the whole action sequence of each policy is evaluated and its first
#   action is executed (as opposed to "marginal", where the agent evaluates each action type).
policy_len = 2
agent_vanilla = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

# Same configuration as agent_vanilla except for policy_len=1 (see the note above policy_len).
agent_si = Agent(
    env.A, env.B, C, D, 
    A_dependencies=env.A_dependencies, 
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=3.0
)

/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/282069077.py:8: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_vanilla = Agent(
/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/282069077.py:20: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_si = Agent(





Copied!







# PRNG key built from the fixed seed 0; it is the rng_key passed to the rollouts.
key = jr.PRNGKey(0) 
# Value passed to rollout as num_timesteps.
T = 3

# PRNG key built from the fixed seed 0; it is the rng_key passed to the rollouts.
key = jr.PRNGKey(0) 
# Value passed to rollout as num_timesteps.
T = 3





Copied!







# PRNG key built from the fixed seed 0; it is the rng_key passed to the rollouts.
key = jr.PRNGKey(0) 
# Value passed to rollout as num_timesteps.
T = 3

# PRNG key built from the fixed seed 0; it is the rng_key passed to the rollouts.
key = jr.PRNGKey(0) 
# Value passed to rollout as num_timesteps.
T = 3





Copied!







# Configure the sophisticated-inference policy search that is later passed to `rollout`
# via policy_search=. The inline notes say what each setting does; the thresholds are
# chosen so that pruning and early stopping are disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 2 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=10, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=0.0, # no pruning of unlikely observations
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=3, # temperature parameter; lower value (--> 1) prunes policies less aggressively as probabilities are flattened while higher value (--> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)

# Configure the sophisticated-inference policy search that is later passed to `rollout`
# via policy_search=. The inline notes say what each setting does; the thresholds are
# chosen so that pruning and early stopping are disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 2 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=10, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=0.0, # no pruning of unlikely observations
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=3, # temperature parameter; lower value (--> 1) prunes policies less aggressively as probabilities are flattened while higher value (--> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)





Copied!







# Configure the sophisticated-inference policy search that is later passed to `rollout`
# via policy_search=. The inline notes say what each setting does; the thresholds are
# chosen so that pruning and early stopping are disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 2 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=10, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=0.0, # no pruning of unlikely observations
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=3, # temperature parameter; lower value (--> 1) prunes policies less aggressively as probabilities are flattened while higher value (--> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)

# Configure the sophisticated-inference policy search that is later passed to `rollout`
# via policy_search=. The inline notes say what each setting does; the thresholds are
# chosen so that pruning and early stopping are disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 2 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=10, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=0.0, # no pruning of unlikely observations
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=3, # temperature parameter; lower value (--> 1) prunes policies less aggressively as probabilities are flattened while higher value (--> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)





Copied!







# Run both agents in the same environment for T timesteps, starting from the same PRNG key.
# Only the second element of each return value (the info record) is kept.
_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
# The sophisticated-inference run differs only in the agent and the explicit policy_search.
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)

# Run both agents in the same environment for T timesteps, starting from the same PRNG key.
# Only the second element of each return value (the info record) is kept.
_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
# The sophisticated-inference run differs only in the agent and the explicit policy_search.
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)





Copied!







# Run both agents in the same environment for T timesteps, starting from the same PRNG key.
# Only the second element of each return value (the info record) is kept.
_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
# The sophisticated-inference run differs only in the agent and the explicit policy_search.
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)

# Run both agents in the same environment for T timesteps, starting from the same PRNG key.
# Only the second element of each return value (the info record) is kept.
_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
# The sophisticated-inference run differs only in the agent and the explicit policy_search.
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)





Copied!







def make_gif(info):
    """Render each timestep of a rollout and play the frames back as a video.

    For every index ``t`` along axis 1 of the first observation array, the
    slices ``[:, t, :]`` of the first two entries of ``info["observation"]``
    are passed to the notebook-level ``env.render`` in ``"rgb_array"`` mode.
    The collected uint8 frames are displayed with ``mediapy.show_video`` at
    1 frame per second (despite the name, no GIF file is written here).
    Nothing is returned.
    """
    observations = info["observation"]
    rendered = []

    for step in range(observations[0].shape[1]):
        step_observations = [observations[0][:, step, :], observations[1][:, step, :]]
        image = env.render(mode="rgb_array", observations=step_observations)
        image = np.asarray(image, dtype=np.uint8)
        plt.close()  # close the figure after each render to prevent a memory leak
        rendered.append(image)

    video = np.array(rendered, dtype=np.uint8)
    mediapy.show_video(video, fps=1)

def make_gif(info):
    """Render each timestep of a rollout and play the frames back as a video.

    For every index ``t`` along axis 1 of the first observation array, the
    slices ``[:, t, :]`` of the first two entries of ``info["observation"]``
    are passed to the notebook-level ``env.render`` in ``"rgb_array"`` mode.
    The collected uint8 frames are displayed with ``mediapy.show_video`` at
    1 frame per second (despite the name, no GIF file is written here).
    Nothing is returned.
    """
    observations = info["observation"]
    rendered = []

    for step in range(observations[0].shape[1]):
        step_observations = [observations[0][:, step, :], observations[1][:, step, :]]
        image = env.render(mode="rgb_array", observations=step_observations)
        image = np.asarray(image, dtype=np.uint8)
        plt.close()  # close the figure after each render to prevent a memory leak
        rendered.append(image)

    video = np.array(rendered, dtype=np.uint8)
    mediapy.show_video(video, fps=1)





Copied!







def make_gif(info):
    """Render each timestep of a rollout and play the frames back as a video.

    For every index ``t`` along axis 1 of the first observation array, the
    slices ``[:, t, :]`` of the first two entries of ``info["observation"]``
    are passed to the notebook-level ``env.render`` in ``"rgb_array"`` mode.
    The collected uint8 frames are displayed with ``mediapy.show_video`` at
    1 frame per second (despite the name, no GIF file is written here).
    Nothing is returned.
    """
    observations = info["observation"]
    rendered = []

    for step in range(observations[0].shape[1]):
        step_observations = [observations[0][:, step, :], observations[1][:, step, :]]
        image = env.render(mode="rgb_array", observations=step_observations)
        image = np.asarray(image, dtype=np.uint8)
        plt.close()  # close the figure after each render to prevent a memory leak
        rendered.append(image)

    video = np.array(rendered, dtype=np.uint8)
    mediapy.show_video(video, fps=1)

def make_gif(info):
    """Render each timestep of a rollout and play the frames back as a video.

    For every index ``t`` along axis 1 of the first observation array, the
    slices ``[:, t, :]`` of the first two entries of ``info["observation"]``
    are passed to the notebook-level ``env.render`` in ``"rgb_array"`` mode.
    The collected uint8 frames are displayed with ``mediapy.show_video`` at
    1 frame per second (despite the name, no GIF file is written here).
    Nothing is returned.
    """
    observations = info["observation"]
    rendered = []

    for step in range(observations[0].shape[1]):
        step_observations = [observations[0][:, step, :], observations[1][:, step, :]]
        image = env.render(mode="rgb_array", observations=step_observations)
        image = np.asarray(image, dtype=np.uint8)
        plt.close()  # close the figure after each render to prevent a memory leak
        rendered.append(image)

    video = np.array(rendered, dtype=np.uint8)
    mediapy.show_video(video, fps=1)





Copied!







# Render and display the rollout recorded for the vanilla agent.
make_gif(info_vanilla)

# Render and display the rollout recorded for the vanilla agent.
make_gif(info_vanilla)





Copied!







# Render and display the rollout recorded for the vanilla agent.
make_gif(info_vanilla)

# Render and display the rollout recorded for the vanilla agent.
make_gif(info_vanilla)





Copied!







# Render and display the rollout recorded for the sophisticated-inference agent.
make_gif(info_si)

# Render and display the rollout recorded for the sophisticated-inference agent.
make_gif(info_si)





Copied!







# Render and display the rollout recorded for the sophisticated-inference agent.
make_gif(info_si)

# Render and display the rollout recorded for the sophisticated-inference agent.
make_gif(info_si)





Copied!







# Helper functions defined in this cell:
# - print_qpi: prints, per timestep, the probability of each current action
# - print_agent_behavior: prints the observation and action recorded at each timestep
#
# Background: qpi is a posterior over whole policies (action sequences). The probability
# of the *current* action is obtained by marginalizing over policies, i.e. summing the
# qpi values of all policies that share the same first action.

# Print NumPy floats with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

def print_qpi(agent, info, print_t1=False):
    """Print the probability of each first-step action, timestep by timestep.

    Args:
        agent: object exposing ``unique_multiactions`` (column 0 is read as the
            action index) and ``multiaction_probabilities``.
        info: rollout record; ``info["qpi"]`` is sliced as ``[:, t, :]``.
        print_t1: when true only timestep 0 is printed; otherwise every index
            along axis 1 of ``info["qpi"]`` is printed.

    Only row 0 of what ``agent.multiaction_probabilities`` returns is shown.
    Entries with a negative action index are skipped (presumably padding —
    confirm against the Agent implementation). Indices without a known label
    are printed as ``action_<index>``.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
    }
    policy_posterior = info["qpi"]

    if print_t1:
        num_steps = 1
    else:
        num_steps = policy_posterior.shape[1]

    # first column of unique_multiactions: for a single control factor this is
    # just the list of action indices
    first_actions = agent.unique_multiactions[:, 0]

    for step in range(num_steps):
        print(f"Timestep {step}:")
        step_probs = agent.multiaction_probabilities(policy_posterior[:, step, :])[0]
        pairs = zip(first_actions.tolist(), step_probs.tolist())

        for action, probability in pairs:
            if action < 0:
                continue
            label = labels.get(action, f"action_{action}")
            print(f"  {label}: {probability:.3f}")
        print()

def print_agent_behavior(info):
    """Print one line per timestep: what was observed and which action followed.

    Args:
        info: rollout record. ``info["action"]`` is indexed as
            ``[batch, timestep, action_dim]`` and the first two entries of
            ``info["observation"]`` as ``[batch, timestep, obs_dim]``; only
            batch element 0 and component 0 are read.

    The number of lines equals the size of axis 1 of ``info["action"]``.
    Indices without a known label are printed as ``action_<i>``,
    ``location_<i>`` or ``outcome_<i>`` rather than as ``None``.
    """
    action_names = {0: "move to center", 1: "move to left arm", 2: "move to right arm", 3: "move to cue"}

    location_obs = {0: "center loc", 1: "left arm loc", 2: "right arm loc", 3: "cue-left-arm", 4: "cue-right-arm"}
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}

    actions = info["action"]
    observations = info["observation"]

    for t in range(actions.shape[1]):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])

        action_name = action_names.get(action_idx, f"action_{action_idx}")
        # Same fallback style as for actions, so an unlabelled index stays
        # identifiable instead of being printed as "None".
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}) -> action={action_name}")

# Helper functions defined in this cell:
# - print_qpi: prints, per timestep, the probability of each current action
# - print_agent_behavior: prints the observation and action recorded at each timestep
#
# Background: qpi is a posterior over whole policies (action sequences). The probability
# of the *current* action is obtained by marginalizing over policies, i.e. summing the
# qpi values of all policies that share the same first action.

# Print NumPy floats with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

def print_qpi(agent, info, print_t1=False):
    """Print the probability of each first-step action, timestep by timestep.

    Args:
        agent: object exposing ``unique_multiactions`` (column 0 is read as the
            action index) and ``multiaction_probabilities``.
        info: rollout record; ``info["qpi"]`` is sliced as ``[:, t, :]``.
        print_t1: when true only timestep 0 is printed; otherwise every index
            along axis 1 of ``info["qpi"]`` is printed.

    Only row 0 of what ``agent.multiaction_probabilities`` returns is shown.
    Entries with a negative action index are skipped (presumably padding —
    confirm against the Agent implementation). Indices without a known label
    are printed as ``action_<index>``.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
    }
    policy_posterior = info["qpi"]

    if print_t1:
        num_steps = 1
    else:
        num_steps = policy_posterior.shape[1]

    # first column of unique_multiactions: for a single control factor this is
    # just the list of action indices
    first_actions = agent.unique_multiactions[:, 0]

    for step in range(num_steps):
        print(f"Timestep {step}:")
        step_probs = agent.multiaction_probabilities(policy_posterior[:, step, :])[0]
        pairs = zip(first_actions.tolist(), step_probs.tolist())

        for action, probability in pairs:
            if action < 0:
                continue
            label = labels.get(action, f"action_{action}")
            print(f"  {label}: {probability:.3f}")
        print()

def print_agent_behavior(info):
    """Print one line per timestep: what was observed and which action followed.

    Args:
        info: rollout record. ``info["action"]`` is indexed as
            ``[batch, timestep, action_dim]`` and the first two entries of
            ``info["observation"]`` as ``[batch, timestep, obs_dim]``; only
            batch element 0 and component 0 are read.

    The number of lines equals the size of axis 1 of ``info["action"]``.
    Indices without a known label are printed as ``action_<i>``,
    ``location_<i>`` or ``outcome_<i>`` rather than as ``None``.
    """
    action_names = {0: "move to center", 1: "move to left arm", 2: "move to right arm", 3: "move to cue"}

    location_obs = {0: "center loc", 1: "left arm loc", 2: "right arm loc", 3: "cue-left-arm", 4: "cue-right-arm"}
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}

    actions = info["action"]
    observations = info["observation"]

    for t in range(actions.shape[1]):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])

        action_name = action_names.get(action_idx, f"action_{action_idx}")
        # Same fallback style as for actions, so an unlabelled index stays
        # identifiable instead of being printed as "None".
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}) -> action={action_name}")





Copied!







# Helper functions defined in this cell:
# - print_qpi: prints, per timestep, the probability of each current action
# - print_agent_behavior: prints the observation and action recorded at each timestep
#
# Background: qpi is a posterior over whole policies (action sequences). The probability
# of the *current* action is obtained by marginalizing over policies, i.e. summing the
# qpi values of all policies that share the same first action.

# Print NumPy floats with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

def print_qpi(agent, info, print_t1=False):
    """Print the probability of each first-step action, timestep by timestep.

    Args:
        agent: object exposing ``unique_multiactions`` (column 0 is read as the
            action index) and ``multiaction_probabilities``.
        info: rollout record; ``info["qpi"]`` is sliced as ``[:, t, :]``.
        print_t1: when true only timestep 0 is printed; otherwise every index
            along axis 1 of ``info["qpi"]`` is printed.

    Only row 0 of what ``agent.multiaction_probabilities`` returns is shown.
    Entries with a negative action index are skipped (presumably padding —
    confirm against the Agent implementation). Indices without a known label
    are printed as ``action_<index>``.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
    }
    policy_posterior = info["qpi"]

    if print_t1:
        num_steps = 1
    else:
        num_steps = policy_posterior.shape[1]

    # first column of unique_multiactions: for a single control factor this is
    # just the list of action indices
    first_actions = agent.unique_multiactions[:, 0]

    for step in range(num_steps):
        print(f"Timestep {step}:")
        step_probs = agent.multiaction_probabilities(policy_posterior[:, step, :])[0]
        pairs = zip(first_actions.tolist(), step_probs.tolist())

        for action, probability in pairs:
            if action < 0:
                continue
            label = labels.get(action, f"action_{action}")
            print(f"  {label}: {probability:.3f}")
        print()

def print_agent_behavior(info):
    """Print one line per timestep: what was observed and which action followed.

    Args:
        info: rollout record. ``info["action"]`` is indexed as
            ``[batch, timestep, action_dim]`` and the first two entries of
            ``info["observation"]`` as ``[batch, timestep, obs_dim]``; only
            batch element 0 and component 0 are read.

    The number of lines equals the size of axis 1 of ``info["action"]``.
    Indices without a known label are printed as ``action_<i>``,
    ``location_<i>`` or ``outcome_<i>`` rather than as ``None``.
    """
    action_names = {0: "move to center", 1: "move to left arm", 2: "move to right arm", 3: "move to cue"}

    location_obs = {0: "center loc", 1: "left arm loc", 2: "right arm loc", 3: "cue-left-arm", 4: "cue-right-arm"}
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}

    actions = info["action"]
    observations = info["observation"]

    for t in range(actions.shape[1]):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])

        action_name = action_names.get(action_idx, f"action_{action_idx}")
        # Same fallback style as for actions, so an unlabelled index stays
        # identifiable instead of being printed as "None".
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}) -> action={action_name}")

# Helper functions defined in this cell:
# - print_qpi: prints, per timestep, the probability of each current action
# - print_agent_behavior: prints the observation and action recorded at each timestep
#
# Background: qpi is a posterior over whole policies (action sequences). The probability
# of the *current* action is obtained by marginalizing over policies, i.e. summing the
# qpi values of all policies that share the same first action.

# Print NumPy floats with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

def print_qpi(agent, info, print_t1=False):
    """Print the probability of each first-step action, timestep by timestep.

    Args:
        agent: object exposing ``unique_multiactions`` (column 0 is read as the
            action index) and ``multiaction_probabilities``.
        info: rollout record; ``info["qpi"]`` is sliced as ``[:, t, :]``.
        print_t1: when true only timestep 0 is printed; otherwise every index
            along axis 1 of ``info["qpi"]`` is printed.

    Only row 0 of what ``agent.multiaction_probabilities`` returns is shown.
    Entries with a negative action index are skipped (presumably padding —
    confirm against the Agent implementation). Indices without a known label
    are printed as ``action_<index>``.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
    }
    policy_posterior = info["qpi"]

    if print_t1:
        num_steps = 1
    else:
        num_steps = policy_posterior.shape[1]

    # first column of unique_multiactions: for a single control factor this is
    # just the list of action indices
    first_actions = agent.unique_multiactions[:, 0]

    for step in range(num_steps):
        print(f"Timestep {step}:")
        step_probs = agent.multiaction_probabilities(policy_posterior[:, step, :])[0]
        pairs = zip(first_actions.tolist(), step_probs.tolist())

        for action, probability in pairs:
            if action < 0:
                continue
            label = labels.get(action, f"action_{action}")
            print(f"  {label}: {probability:.3f}")
        print()

def print_agent_behavior(info):
    """Print one line per timestep: what was observed and which action followed.

    Args:
        info: rollout record. ``info["action"]`` is indexed as
            ``[batch, timestep, action_dim]`` and the first two entries of
            ``info["observation"]`` as ``[batch, timestep, obs_dim]``; only
            batch element 0 and component 0 are read.

    The number of lines equals the size of axis 1 of ``info["action"]``.
    Indices without a known label are printed as ``action_<i>``,
    ``location_<i>`` or ``outcome_<i>`` rather than as ``None``.
    """
    action_names = {0: "move to center", 1: "move to left arm", 2: "move to right arm", 3: "move to cue"}

    location_obs = {0: "center loc", 1: "left arm loc", 2: "right arm loc", 3: "cue-left-arm", 4: "cue-right-arm"}
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}

    actions = info["action"]
    observations = info["observation"]

    for t in range(actions.shape[1]):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])

        action_name = action_names.get(action_idx, f"action_{action_idx}")
        # Same fallback style as for actions, so an unlabelled index stays
        # identifiable instead of being printed as "None".
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}) -> action={action_name}")





Copied!







# Print the per-timestep observation -> action trace of the vanilla agent's rollout.
print_agent_behavior(info_vanilla)

# Print the per-timestep observation -> action trace of the vanilla agent's rollout.
print_agent_behavior(info_vanilla)

t=0: observed=(center loc, no_outcome) -> action=move to cue
t=1: observed=(cue-left-arm, no_outcome) -> action=move to left arm
t=2: observed=(left arm loc, reward) -> action=move to left arm
t=3: observed=(left arm loc, reward) -> action=move to left arm





Copied!







# Print the per-timestep observation -> action trace of the vanilla agent's rollout.
print_agent_behavior(info_vanilla)

# Print the per-timestep observation -> action trace of the vanilla agent's rollout.
print_agent_behavior(info_vanilla)

t=0: observed=(center loc, no_outcome) -> action=move to cue
t=1: observed=(cue-left-arm, no_outcome) -> action=move to left arm
t=2: observed=(left arm loc, reward) -> action=move to left arm
t=3: observed=(left arm loc, reward) -> action=move to left arm





Copied!







# Print the per-timestep observation -> action trace of the sophisticated-inference agent's rollout.
print_agent_behavior(info_si)

# Print the per-timestep observation -> action trace of the sophisticated-inference agent's rollout.
print_agent_behavior(info_si)

t=0: observed=(center loc, no_outcome) -> action=move to cue
t=1: observed=(cue-left-arm, no_outcome) -> action=move to left arm
t=2: observed=(left arm loc, reward) -> action=move to left arm
t=3: observed=(left arm loc, reward) -> action=move to left arm





Copied!







# Print the per-timestep observation -> action trace of the sophisticated-inference agent's rollout.
print_agent_behavior(info_si)

# Print the per-timestep observation -> action trace of the sophisticated-inference agent's rollout.
print_agent_behavior(info_si)

t=0: observed=(center loc, no_outcome) -> action=move to cue
t=1: observed=(cue-left-arm, no_outcome) -> action=move to left arm
t=2: observed=(left arm loc, reward) -> action=move to left arm
t=3: observed=(left arm loc, reward) -> action=move to left arm





Copied!







# Show the vanilla agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_vanilla, info_vanilla, print_t1=True)

# Show the vanilla agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_vanilla, info_vanilla, print_t1=True)

Timestep 0:
  move to center: 0.183
  move to left arm: 0.004
  move to right arm: 0.004
  move to cue: 0.809





Copied!







# Show the vanilla agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_vanilla, info_vanilla, print_t1=True)

# Show the vanilla agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_vanilla, info_vanilla, print_t1=True)

Timestep 0:
  move to center: 0.183
  move to left arm: 0.004
  move to right arm: 0.004
  move to cue: 0.809





Copied!







# Show the sophisticated-inference agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_si, info_si, print_t1=True)

# Show the sophisticated-inference agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_si, info_si, print_t1=True)

Timestep 0:
  move to center: 0.003
  move to left arm: 0.008
  move to right arm: 0.008
  move to cue: 0.980





Copied!







# Show the sophisticated-inference agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_si, info_si, print_t1=True)

# Show the sophisticated-inference agent's action probabilities for the first timestep only (print_t1=True).
print_qpi(agent_si, info_si, print_t1=True)

Timestep 0:
  move to center: 0.003
  move to left arm: 0.008
  move to right arm: 0.008
  move to cue: 0.980





Copied!







from pymdp.envs import TMaze

from pymdp.envs import TMaze





Copied!







from pymdp.envs import TMaze

from pymdp.envs import TMaze





Copied!







# Parameters that are passed to the TMaze environment constructor.
# 0 = reward in the left arm, 1 = reward in the right arm, None = random allocation
reward_condition = 0
# probability that the cue is valid (1.0 = cues are always valid)
cue_validity = 1.0

# probability of reward in the correct arm (1.0 = always rewarded)
reward_probability = 1.0
# If True, punishment occurs as a function of reward probability (e.g. with a
# reward probability of 0.8 there is a 20% chance of punishment).
# If False, punishment occurs with a set probability: the remaining mass in the
# rewarding arm is "no outcome" (e.g. 20% no outcome) and punishment only occurs
# in the other (non-rewarding) arm, governed by punishment_probability.
# NOTE(review): wording follows the original comment; confirm details in pymdp/envs/tmaze.py.
dependent_outcomes = True
# probability of punishment in the other (non-rewarding) arm (1.0 = always punished)
punishment_probability = 1.0

# Parameters that are passed to the TMaze environment constructor.
# 0 = reward in the left arm, 1 = reward in the right arm, None = random allocation
reward_condition = 0
# probability that the cue is valid (1.0 = cues are always valid)
cue_validity = 1.0

# probability of reward in the correct arm (1.0 = always rewarded)
reward_probability = 1.0
# If True, punishment occurs as a function of reward probability (e.g. with a
# reward probability of 0.8 there is a 20% chance of punishment).
# If False, punishment occurs with a set probability: the remaining mass in the
# rewarding arm is "no outcome" (e.g. 20% no outcome) and punishment only occurs
# in the other (non-rewarding) arm, governed by punishment_probability.
# NOTE(review): wording follows the original comment; confirm details in pymdp/envs/tmaze.py.
dependent_outcomes = True
# probability of punishment in the other (non-rewarding) arm (1.0 = always punished)
punishment_probability = 1.0





Copied!







# Parameters that are passed to the TMaze environment constructor.
# 0 = reward in the left arm, 1 = reward in the right arm, None = random allocation
reward_condition = 0
# probability that the cue is valid (1.0 = cues are always valid)
cue_validity = 1.0

# probability of reward in the correct arm (1.0 = always rewarded)
reward_probability = 1.0
# If True, punishment occurs as a function of reward probability (e.g. with a
# reward probability of 0.8 there is a 20% chance of punishment).
# If False, punishment occurs with a set probability: the remaining mass in the
# rewarding arm is "no outcome" (e.g. 20% no outcome) and punishment only occurs
# in the other (non-rewarding) arm, governed by punishment_probability.
# NOTE(review): wording follows the original comment; confirm details in pymdp/envs/tmaze.py.
dependent_outcomes = True
# probability of punishment in the other (non-rewarding) arm (1.0 = always punished)
punishment_probability = 1.0

# Parameters that are passed to the TMaze environment constructor.
# 0 = reward in the left arm, 1 = reward in the right arm, None = random allocation
reward_condition = 0
# probability that the cue is valid (1.0 = cues are always valid)
cue_validity = 1.0

# probability of reward in the correct arm (1.0 = always rewarded)
reward_probability = 1.0
# If True, punishment occurs as a function of reward probability (e.g. with a
# reward probability of 0.8 there is a 20% chance of punishment).
# If False, punishment occurs with a set probability: the remaining mass in the
# rewarding arm is "no outcome" (e.g. 20% no outcome) and punishment only occurs
# in the other (non-rewarding) arm, governed by punishment_probability.
# NOTE(review): wording follows the original comment; confirm details in pymdp/envs/tmaze.py.
dependent_outcomes = True
# probability of punishment in the other (non-rewarding) arm (1.0 = always punished)
punishment_probability = 1.0





Copied!







# Initialize the full TMaze environment from the parameters defined in the
# previous cell. See tmaze.py in pymdp/envs for the implementation details.
# The resulting `env` exposes the A/B tensors and their dependencies that are
# used below to size the C and D tensors and to build the agents.
env = TMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,
    reward_probability=reward_probability,
    punishment_probability=punishment_probability,
    dependent_outcomes=dependent_outcomes,
)

# Initialize the full TMaze environment from the parameters defined in the
# previous cell. See tmaze.py in pymdp/envs for the implementation details.
# The resulting `env` exposes the A/B tensors and their dependencies that are
# used below to size the C and D tensors and to build the agents.
env = TMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,
    reward_probability=reward_probability,
    punishment_probability=punishment_probability,
    dependent_outcomes=dependent_outcomes,
)





Copied!







# Initialize the full TMaze environment from the parameters defined in the
# previous cell. See tmaze.py in pymdp/envs for the implementation details.
# The resulting `env` exposes the A/B tensors and their dependencies that are
# used below to size the C and D tensors and to build the agents.
env = TMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,
    reward_probability=reward_probability,
    punishment_probability=punishment_probability,
    dependent_outcomes=dependent_outcomes,
)

# Initialize the full TMaze environment from the parameters defined in the
# previous cell. See tmaze.py in pymdp/envs for the implementation details.
# The resulting `env` exposes the A/B tensors and their dependencies that are
# used below to size the C and D tensors and to build the agents.
env = TMaze(
    reward_condition=reward_condition,
    cue_validity=cue_validity,
    reward_probability=reward_probability,
    punishment_probability=punishment_probability,
    dependent_outcomes=dependent_outcomes,
)





Copied!







# Preference (C) vectors for the Agent: one zero-filled vector per observation
# modality, ordered [location], [reward], [cue], each sized from the first axis
# of the matching A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# outcome modality only: prefer reward (index 1), avoid punishment (index 2)
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# cue modality: slight cost of observing a cue (indices 1 and 2)
C[2] = C[2].at[1].set(-0.5).at[2].set(-0.5)

# Preference (C) vectors for the Agent: one zero-filled vector per observation
# modality, ordered [location], [reward], [cue], each sized from the first axis
# of the matching A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# outcome modality only: prefer reward (index 1), avoid punishment (index 2)
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# cue modality: slight cost of observing a cue (indices 1 and 2)
C[2] = C[2].at[1].set(-0.5).at[2].set(-0.5)





Copied!







# Preference (C) vectors for the Agent: one zero-filled vector per observation
# modality, ordered [location], [reward], [cue], each sized from the first axis
# of the matching A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# outcome modality only: prefer reward (index 1), avoid punishment (index 2)
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# cue modality: slight cost of observing a cue (indices 1 and 2)
C[2] = C[2].at[1].set(-0.5).at[2].set(-0.5)

# Preference (C) vectors for the Agent: one zero-filled vector per observation
# modality, ordered [location], [reward], [cue], each sized from the first axis
# of the matching A tensor.
C = [jnp.zeros(likelihood.shape[0], dtype=jnp.float32) for likelihood in env.A]

# outcome modality only: prefer reward (index 1), avoid punishment (index 2)
C[1] = C[1].at[1].set(2.0).at[2].set(-6.0)

# cue modality: slight cost of observing a cue (indices 1 and 2)
C[2] = C[2].at[1].set(-0.5).at[2].set(-0.5)





Copied!







# Initial-state priors (D) for the agent, one per hidden-state factor
# ([location], [reward]), sized from the first axis of each B tensor.

# location prior: all mass on index 0, i.e. the agent starts in the center
D_loc = jnp.zeros(env.B[0].shape[0], dtype=jnp.float32).at[0].set(1.0)

# reward-location prior: uniform (a vector of ones normalized to sum to 1)
D_reward = jnp.ones(env.B[1].shape[0], dtype=jnp.float32)
D_reward = D_reward / jnp.sum(D_reward, axis=0, keepdims=True)

D = [D_loc, D_reward]

# Initial-state priors (D) for the agent, one per hidden-state factor
# ([location], [reward]), sized from the first axis of each B tensor.

# location prior: all mass on index 0, i.e. the agent starts in the center
D_loc = jnp.zeros(env.B[0].shape[0], dtype=jnp.float32).at[0].set(1.0)

# reward-location prior: uniform (a vector of ones normalized to sum to 1)
D_reward = jnp.ones(env.B[1].shape[0], dtype=jnp.float32)
D_reward = D_reward / jnp.sum(D_reward, axis=0, keepdims=True)

D = [D_loc, D_reward]





Copied!







# Initial-state priors (D) for the agent, one per hidden-state factor
# ([location], [reward]), sized from the first axis of each B tensor.

# location prior: all mass on index 0, i.e. the agent starts in the center
D_loc = jnp.zeros(env.B[0].shape[0], dtype=jnp.float32).at[0].set(1.0)

# reward-location prior: uniform (a vector of ones normalized to sum to 1)
D_reward = jnp.ones(env.B[1].shape[0], dtype=jnp.float32)
D_reward = D_reward / jnp.sum(D_reward, axis=0, keepdims=True)

D = [D_loc, D_reward]

# Initial-state priors (D) for the agent, one per hidden-state factor
# ([location], [reward]), sized from the first axis of each B tensor.

# location prior: all mass on index 0, i.e. the agent starts in the center
D_loc = jnp.zeros(env.B[0].shape[0], dtype=jnp.float32).at[0].set(1.0)

# reward-location prior: uniform (a vector of ones normalized to sum to 1)
D_reward = jnp.ones(env.B[1].shape[0], dtype=jnp.float32)
D_reward = D_reward / jnp.sum(D_reward, axis=0, keepdims=True)

D = [D_loc, D_reward]





Copied!







# Note that the two agents are initialized with different policy lengths for the vanilla vs sophisticated
# inference (SI) planning algorithms, even though both end up planning with a horizon of 4.
# The SI agent uses policy_len=1 because its planning horizon is not taken from the Agent: it is given to
# `si_policy_search(horizon=policy_len)`, and the resulting search is passed to `rollout` via `policy_search`.

# action_selection="deterministic" means selecting an action from the policy probability distribution (q_pi) by arg-maxxing
# sampling_mode="full" means evaluating the whole action sequence in each policy and executing the first action (as opposed to marginal where the agent evaluates each action type)

gamma = 3.0  # shared by both agents; the same value is also passed as `gamma` to si_policy_search
policy_len = 4  # policy length of the vanilla agent; reused as the horizon of the SI search
# vanilla agent: evaluates policies of length `policy_len`; learning of A and B is disabled
agent_vanilla = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# SI agent: identical configuration except for policy_len=1 (see the note at the top of this cell)
agent_si = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# Note that the two agents are initialized with different policy lengths for the vanilla vs sophisticated
# inference (SI) planning algorithms, even though both end up planning with a horizon of 4.
# The SI agent uses policy_len=1 because its planning horizon is not taken from the Agent: it is given to
# `si_policy_search(horizon=policy_len)`, and the resulting search is passed to `rollout` via `policy_search`.

# action_selection="deterministic" means selecting an action from the policy probability distribution (q_pi) by arg-maxxing
# sampling_mode="full" means evaluating the whole action sequence in each policy and executing the first action (as opposed to marginal where the agent evaluates each action type)

gamma = 3.0  # shared by both agents; the same value is also passed as `gamma` to si_policy_search
policy_len = 4  # policy length of the vanilla agent; reused as the horizon of the SI search
# vanilla agent: evaluates policies of length `policy_len`; learning of A and B is disabled
agent_vanilla = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# SI agent: identical configuration except for policy_len=1 (see the note at the top of this cell)
agent_si = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/2056066623.py:10: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_vanilla = Agent(
/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/2056066623.py:22: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_si = Agent(





Copied!







# Note that the two agents are initialized with different policy lengths for the vanilla vs sophisticated
# inference (SI) planning algorithms, even though both end up planning with a horizon of 4.
# The SI agent uses policy_len=1 because its planning horizon is not taken from the Agent: it is given to
# `si_policy_search(horizon=policy_len)`, and the resulting search is passed to `rollout` via `policy_search`.

# action_selection="deterministic" means selecting an action from the policy probability distribution (q_pi) by arg-maxxing
# sampling_mode="full" means evaluating the whole action sequence in each policy and executing the first action (as opposed to marginal where the agent evaluates each action type)

gamma = 3.0  # shared by both agents; the same value is also passed as `gamma` to si_policy_search
policy_len = 4  # policy length of the vanilla agent; reused as the horizon of the SI search
# vanilla agent: evaluates policies of length `policy_len`; learning of A and B is disabled
agent_vanilla = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# SI agent: identical configuration except for policy_len=1 (see the note at the top of this cell)
agent_si = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# Note that the two agents are initialized with different policy lengths for the vanilla vs sophisticated
# inference (SI) planning algorithms, even though both end up planning with a horizon of 4.
# The SI agent uses policy_len=1 because its planning horizon is not taken from the Agent: it is given to
# `si_policy_search(horizon=policy_len)`, and the resulting search is passed to `rollout` via `policy_search`.

# action_selection="deterministic" means selecting an action from the policy probability distribution (q_pi) by arg-maxxing
# sampling_mode="full" means evaluating the whole action sequence in each policy and executing the first action (as opposed to marginal where the agent evaluates each action type)

gamma = 3.0  # shared by both agents; the same value is also passed as `gamma` to si_policy_search
policy_len = 4  # policy length of the vanilla agent; reused as the horizon of the SI search
# vanilla agent: evaluates policies of length `policy_len`; learning of A and B is disabled
agent_vanilla = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=policy_len,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

# SI agent: identical configuration except for policy_len=1 (see the note at the top of this cell)
agent_si = Agent(
    env.A, env.B, C, D,
    A_dependencies=env.A_dependencies,
    B_dependencies=env.B_dependencies,
    policy_len=1,
    learn_A=False,
    learn_B=False,
    action_selection="deterministic",
    sampling_mode="full",
    gamma=gamma,
)

/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/2056066623.py:10: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_vanilla = Agent(
/var/folders/_f/1qqqnkyd5k5g2b1pgfwzzrqm0000gn/T/ipykernel_61048/2056066623.py:22: UserWarning: A JAX array is being set as static! This can result in unexpected behavior and is usually a mistake to do.
  agent_si = Agent(





Copied!







key = jr.PRNGKey(0) 
T = 5

key = jr.PRNGKey(0) 
T = 5





Copied!







key = jr.PRNGKey(0) 
T = 5

key = jr.PRNGKey(0) 
T = 5





Copied!







# Configure the sophisticated-inference (SI) tree search that is later passed to
# `rollout` through its `policy_search` argument. Most limits are set so that
# pruning and early stopping are effectively disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 4 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=45, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=1e-4, # NOTE(review): non-zero, so this is not strictly "no pruning" - presumably observations with probability below 1e-4 are pruned (near-negligible); confirm against si_policy_search
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=gamma, # temperature parameter; lower value (---> 1) prunes policies less aggressively as probabilities are flattened while higher value (---> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)

# Configure the sophisticated-inference (SI) tree search that is later passed to
# `rollout` through its `policy_search` argument. Most limits are set so that
# pruning and early stopping are effectively disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 4 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=45, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=1e-4, # NOTE(review): non-zero, so this is not strictly "no pruning" - presumably observations with probability below 1e-4 are pruned (near-negligible); confirm against si_policy_search
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=gamma, # temperature parameter; lower value (---> 1) prunes policies less aggressively as probabilities are flattened while higher value (---> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)





Copied!







# Configure the sophisticated-inference (SI) tree search that is later passed to
# `rollout` through its `policy_search` argument. Most limits are set so that
# pruning and early stopping are effectively disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 4 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=45, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=1e-4, # NOTE(review): non-zero, so this is not strictly "no pruning" - presumably observations with probability below 1e-4 are pruned (near-negligible); confirm against si_policy_search
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=gamma, # temperature parameter; lower value (---> 1) prunes policies less aggressively as probabilities are flattened while higher value (---> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)

# Configure the sophisticated-inference (SI) tree search that is later passed to
# `rollout` through its `policy_search` argument. Most limits are set so that
# pruning and early stopping are effectively disabled.
si_search = si_policy_search(
    horizon=policy_len, # plans 4 timesteps ahead
    max_nodes=5000, # maximum number of nodes allowed in the tree
    max_branching=45, # maximum number of children allowed per node (moderating the branching factor)
    policy_prune_threshold=0.0, # no pruning of unlikely policies
    observation_prune_threshold=1e-4, # NOTE(review): non-zero, so this is not strictly "no pruning" - presumably observations with probability below 1e-4 are pruned (near-negligible); confirm against si_policy_search
    entropy_stop_threshold=0.0, # disabling halting of expansion if agent is certain enough
    neg_efe_stop_threshold=1e10, # disabling efe value based halting of expansion
    kl_threshold=-1, # disabling node reuse if agent is in similar states after an action
    prune_penalty=512, # default value for prune penalty
    gamma=gamma, # temperature parameter; lower value (---> 1) prunes policies less aggressively as probabilities are flattened while higher value (---> 16) prunes more aggressively
    topk_obsspace=10000, # max number of top observation combinations - this default value just means we want to consider all the observation combinations
)





Copied!







_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)

_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)





Copied!







_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)

_, info_vanilla = rollout(agent_vanilla, env, num_timesteps=T, rng_key=key) # default policy search is vanilla
_, info_si = rollout(agent_si, env, num_timesteps=T, rng_key=key, policy_search=si_search)





Copied!







def make_gif(info):
    """Render every recorded timestep of a rollout and show the frames as a video.

    Uses the notebook-level `env` to draw each frame and displays the stacked
    frames with `mediapy.show_video` at 1 frame per second.

    Args:
        info: rollout info dict; `info["observation"]` holds one array per
            observation modality (the first three are used), indexed as
            [batch, timestep, obs_dim].
    """
    modalities = info["observation"]
    num_timesteps = modalities[0].shape[1]

    frames = []
    for t in range(num_timesteps):
        # slice the three observation modalities at this timestep
        obs_at_t = [modalities[m][:, t, :] for m in range(3)]

        # render the environment using the observations for this timestep
        rendered = env.render(mode="rgb_array", observations=obs_at_t)
        frames.append(np.asarray(rendered, dtype=np.uint8))
        plt.close()  # close the figure to prevent memory leak

    mediapy.show_video(np.array(frames, dtype=np.uint8), fps=1)

def make_gif(info):
    """Render every recorded timestep of a rollout and show the frames as a video.

    Uses the notebook-level `env` to draw each frame and displays the stacked
    frames with `mediapy.show_video` at 1 frame per second.

    Args:
        info: rollout info dict; `info["observation"]` holds one array per
            observation modality (the first three are used), indexed as
            [batch, timestep, obs_dim].
    """
    modalities = info["observation"]
    num_timesteps = modalities[0].shape[1]

    frames = []
    for t in range(num_timesteps):
        # slice the three observation modalities at this timestep
        obs_at_t = [modalities[m][:, t, :] for m in range(3)]

        # render the environment using the observations for this timestep
        rendered = env.render(mode="rgb_array", observations=obs_at_t)
        frames.append(np.asarray(rendered, dtype=np.uint8))
        plt.close()  # close the figure to prevent memory leak

    mediapy.show_video(np.array(frames, dtype=np.uint8), fps=1)





Copied!







def make_gif(info):
    """Render every recorded timestep of a rollout and show the frames as a video.

    Uses the notebook-level `env` to draw each frame and displays the stacked
    frames with `mediapy.show_video` at 1 frame per second.

    Args:
        info: rollout info dict; `info["observation"]` holds one array per
            observation modality (the first three are used), indexed as
            [batch, timestep, obs_dim].
    """
    modalities = info["observation"]
    num_timesteps = modalities[0].shape[1]

    frames = []
    for t in range(num_timesteps):
        # slice the three observation modalities at this timestep
        obs_at_t = [modalities[m][:, t, :] for m in range(3)]

        # render the environment using the observations for this timestep
        rendered = env.render(mode="rgb_array", observations=obs_at_t)
        frames.append(np.asarray(rendered, dtype=np.uint8))
        plt.close()  # close the figure to prevent memory leak

    mediapy.show_video(np.array(frames, dtype=np.uint8), fps=1)

def make_gif(info):
    """Render every recorded timestep of a rollout and show the frames as a video.

    Uses the notebook-level `env` to draw each frame and displays the stacked
    frames with `mediapy.show_video` at 1 frame per second.

    Args:
        info: rollout info dict; `info["observation"]` holds one array per
            observation modality (the first three are used), indexed as
            [batch, timestep, obs_dim].
    """
    modalities = info["observation"]
    num_timesteps = modalities[0].shape[1]

    frames = []
    for t in range(num_timesteps):
        # slice the three observation modalities at this timestep
        obs_at_t = [modalities[m][:, t, :] for m in range(3)]

        # render the environment using the observations for this timestep
        rendered = env.render(mode="rgb_array", observations=obs_at_t)
        frames.append(np.asarray(rendered, dtype=np.uint8))
        plt.close()  # close the figure to prevent memory leak

    mediapy.show_video(np.array(frames, dtype=np.uint8), fps=1)





Copied!







make_gif(info_vanilla)

make_gif(info_vanilla)





Copied!







make_gif(info_vanilla)

make_gif(info_vanilla)





Copied!







make_gif(info_si)

make_gif(info_si)





Copied!







make_gif(info_si)

make_gif(info_si)





Copied!







# qpi is a posterior over whole policies (action sequences).
# To get the probability of the *current* action, we marginalize over policies
# that share the same first action and sum their qpi values.
# helper functions for:
# - printing out policies and respective probabilities of selecting those policies
# - printing out action and observation info for each timestep

np.set_printoptions(precision=2, suppress=True)

def print_qpi(agent, info, print_t1=False):
    """Print, per timestep, the probability of each possible next action.

    `info["qpi"]` is a posterior over whole policies; the agent's
    `multiaction_probabilities` turns the posterior of one timestep into
    probabilities over the actions listed in `agent.unique_multiactions`.

    Args:
        agent: object exposing `unique_multiactions` and
            `multiaction_probabilities` (only the first control factor and
            the first batch entry are used).
        info: rollout info dict with a "qpi" entry indexed as
            [batch, timestep, policy].
        print_t1: if True, only the first timestep (index 0) is printed.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }
    qpi = info["qpi"]
    n_steps = qpi.shape[1] if not print_t1 else 1

    # action indices of the first control factor, one per unique multi-action
    first_factor_actions = agent.unique_multiactions[:, 0]

    for step in range(n_steps):
        print(f"Timestep {step}:")
        probs = agent.multiaction_probabilities(qpi[:, step, :])[0]

        for idx, prob in zip(first_factor_actions.tolist(), probs.tolist()):
            # negative indices are skipped (presumably padding entries - confirm in pymdp)
            if idx >= 0:
                label = labels.get(idx, f"action_{idx}")
                print(f"  {label}: {prob:.3f}")
        print()

def print_agent_behavior(info):
    """Print the observation and chosen action at every timestep of a rollout.

    One line is printed per timestep in the form
    ``t=<t>: observed=(<location>, <outcome>, <cue>) -> action=<action>``.
    Only the first batch entry is reported.

    Args:
        info: rollout info dict with
            - "action": array indexed as [batch, timestep, action_dim]
            - "observation": sequence of three arrays (location, outcome,
              cue), each indexed as [batch, timestep, obs_dim].
    """
    action_names = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }

    location_obs = {
        0: "center loc",
        1: "left arm loc",
        2: "right arm loc",
        3: "cue loc",
        4: "middle loc",
    }
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}
    cue_obs = {0: "no cue", 1: "cue-left", 2: "cue-right"}

    actions = info["action"]
    observations = info["observation"]

    num_timesteps = actions.shape[1]

    for t in range(num_timesteps):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])
        cue_obs_idx = int(observations[2][0, t, 0])

        # Every lookup falls back to an explicit "<kind>_<index>" label so an
        # unmapped index is visible in the output instead of printing "None".
        action_name = action_names.get(action_idx, f"action_{action_idx}")
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")
        cue_name = cue_obs.get(cue_obs_idx, f"cue_{cue_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}, {cue_name}) -> action={action_name}")

# qpi is a posterior over whole policies (action sequences).
# To get the probability of the *current* action, we marginalize over policies
# that share the same first action and sum their qpi values.
# helper functions for:
# - printing out policies and respective probabilities of selecting those policies
# - printing out action and observation info for each timestep

np.set_printoptions(precision=2, suppress=True)

def print_qpi(agent, info, print_t1=False):
    """Print, per timestep, the probability of each possible next action.

    `info["qpi"]` is a posterior over whole policies; the agent's
    `multiaction_probabilities` turns the posterior of one timestep into
    probabilities over the actions listed in `agent.unique_multiactions`.

    Args:
        agent: object exposing `unique_multiactions` and
            `multiaction_probabilities` (only the first control factor and
            the first batch entry are used).
        info: rollout info dict with a "qpi" entry indexed as
            [batch, timestep, policy].
        print_t1: if True, only the first timestep (index 0) is printed.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }
    qpi = info["qpi"]
    n_steps = qpi.shape[1] if not print_t1 else 1

    # action indices of the first control factor, one per unique multi-action
    first_factor_actions = agent.unique_multiactions[:, 0]

    for step in range(n_steps):
        print(f"Timestep {step}:")
        probs = agent.multiaction_probabilities(qpi[:, step, :])[0]

        for idx, prob in zip(first_factor_actions.tolist(), probs.tolist()):
            # negative indices are skipped (presumably padding entries - confirm in pymdp)
            if idx >= 0:
                label = labels.get(idx, f"action_{idx}")
                print(f"  {label}: {prob:.3f}")
        print()

def print_agent_behavior(info):
    """Print the observation and chosen action at every timestep of a rollout.

    One line is printed per timestep in the form
    ``t=<t>: observed=(<location>, <outcome>, <cue>) -> action=<action>``.
    Only the first batch entry is reported.

    Args:
        info: rollout info dict with
            - "action": array indexed as [batch, timestep, action_dim]
            - "observation": sequence of three arrays (location, outcome,
              cue), each indexed as [batch, timestep, obs_dim].
    """
    action_names = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }

    location_obs = {
        0: "center loc",
        1: "left arm loc",
        2: "right arm loc",
        3: "cue loc",
        4: "middle loc",
    }
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}
    cue_obs = {0: "no cue", 1: "cue-left", 2: "cue-right"}

    actions = info["action"]
    observations = info["observation"]

    num_timesteps = actions.shape[1]

    for t in range(num_timesteps):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])
        cue_obs_idx = int(observations[2][0, t, 0])

        # Every lookup falls back to an explicit "<kind>_<index>" label so an
        # unmapped index is visible in the output instead of printing "None".
        action_name = action_names.get(action_idx, f"action_{action_idx}")
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")
        cue_name = cue_obs.get(cue_obs_idx, f"cue_{cue_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}, {cue_name}) -> action={action_name}")





Copied!







# qpi is a posterior over whole policies (action sequences).
# To get the probability of the *current* action, we marginalize over policies
# that share the same first action and sum their qpi values.
# helper functions for:
# - printing out policies and respective probabilities of selecting those policies
# - printing out action and observation info for each timestep

np.set_printoptions(precision=2, suppress=True)

def print_qpi(agent, info, print_t1=False):
    """Print, per timestep, the probability of each possible next action.

    `info["qpi"]` is a posterior over whole policies; the agent's
    `multiaction_probabilities` turns the posterior of one timestep into
    probabilities over the actions listed in `agent.unique_multiactions`.

    Args:
        agent: object exposing `unique_multiactions` and
            `multiaction_probabilities` (only the first control factor and
            the first batch entry are used).
        info: rollout info dict with a "qpi" entry indexed as
            [batch, timestep, policy].
        print_t1: if True, only the first timestep (index 0) is printed.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }
    qpi = info["qpi"]
    n_steps = qpi.shape[1] if not print_t1 else 1

    # action indices of the first control factor, one per unique multi-action
    first_factor_actions = agent.unique_multiactions[:, 0]

    for step in range(n_steps):
        print(f"Timestep {step}:")
        probs = agent.multiaction_probabilities(qpi[:, step, :])[0]

        for idx, prob in zip(first_factor_actions.tolist(), probs.tolist()):
            # negative indices are skipped (presumably padding entries - confirm in pymdp)
            if idx >= 0:
                label = labels.get(idx, f"action_{idx}")
                print(f"  {label}: {prob:.3f}")
        print()

def print_agent_behavior(info):
    """Print the observation and chosen action at every timestep of a rollout.

    One line is printed per timestep in the form
    ``t=<t>: observed=(<location>, <outcome>, <cue>) -> action=<action>``.
    Only the first batch entry is reported.

    Args:
        info: rollout info dict with
            - "action": array indexed as [batch, timestep, action_dim]
            - "observation": sequence of three arrays (location, outcome,
              cue), each indexed as [batch, timestep, obs_dim].
    """
    action_names = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }

    location_obs = {
        0: "center loc",
        1: "left arm loc",
        2: "right arm loc",
        3: "cue loc",
        4: "middle loc",
    }
    outcome_obs = {0: "no_outcome", 1: "reward", 2: "punishment"}
    cue_obs = {0: "no cue", 1: "cue-left", 2: "cue-right"}

    actions = info["action"]
    observations = info["observation"]

    num_timesteps = actions.shape[1]

    for t in range(num_timesteps):
        action_idx = int(actions[0, t, 0])  # [batch, timestep, action_dim]
        location_obs_idx = int(observations[0][0, t, 0])  # [modality][batch, timestep, obs_dim]
        outcome_obs_idx = int(observations[1][0, t, 0])
        cue_obs_idx = int(observations[2][0, t, 0])

        # Every lookup falls back to an explicit "<kind>_<index>" label so an
        # unmapped index is visible in the output instead of printing "None".
        action_name = action_names.get(action_idx, f"action_{action_idx}")
        location_name = location_obs.get(location_obs_idx, f"location_{location_obs_idx}")
        outcome_name = outcome_obs.get(outcome_obs_idx, f"outcome_{outcome_obs_idx}")
        cue_name = cue_obs.get(cue_obs_idx, f"cue_{cue_obs_idx}")

        print(f"t={t}: observed=({location_name}, {outcome_name}, {cue_name}) -> action={action_name}")

# qpi is a posterior over whole policies (action sequences).
# To get the probability of the *current* action, we marginalize over policies
# that share the same first action and sum their qpi values.
# helper functions for:
# - printing out policies and respective probabilities of selecting those policies
# - printing out action and observation info for each timestep

np.set_printoptions(precision=2, suppress=True)

def print_qpi(agent, info, print_t1=False):
    """Print, per timestep, the probability of each possible next action.

    `info["qpi"]` is a posterior over whole policies; the agent's
    `multiaction_probabilities` turns the posterior of one timestep into
    probabilities over the actions listed in `agent.unique_multiactions`.

    Args:
        agent: object exposing `unique_multiactions` and
            `multiaction_probabilities` (only the first control factor and
            the first batch entry are used).
        info: rollout info dict with a "qpi" entry indexed as
            [batch, timestep, policy].
        print_t1: if True, only the first timestep (index 0) is printed.
    """
    labels = {
        0: "move to center",
        1: "move to left arm",
        2: "move to right arm",
        3: "move to cue",
        4: "move to middle",
    }
    qpi = info["qpi"]
    n_steps = qpi.shape[1] if not print_t1 else 1

    # action indices of the first control factor, one per unique multi-action
    first_factor_actions = agent.unique_multiactions[:, 0]

    for step in range(n_steps):
        print(f"Timestep {step}:")
        probs = agent.multiaction_probabilities(qpi[:, step, :])[0]

        for idx, prob in zip(first_factor_actions.tolist(), probs.tolist()):
            # negative indices are skipped (presumably padding entries - confirm in pymdp)
            if idx >= 0:
                label = labels.get(idx, f"action_{idx}")
                print(f"  {label}: {prob:.3f}")
        print()

def print_agent_behavior(info):
    """Print one line per timestep: what was observed and which action followed.

    Args:
        info: mapping with
            - ``"action"``: array indexed as ``[batch, timestep, action_dim]``;
              its axis 1 length sets the number of printed timesteps.
            - ``"observation"``: sequence indexed as
              ``[modality][batch, timestep, obs_dim]``; modalities 0, 1 and 2
              are read as location, outcome and cue respectively.

    Only batch element 0 and component 0 of each array are reported.
    Unknown action indices are printed as ``action_<idx>``; unknown
    observation indices are printed as ``None``.

    Returns:
        None. Everything is written to stdout.
    """
    # Label for each known action index.
    move_labels = dict(
        enumerate(
            (
                "move to center",
                "move to left arm",
                "move to right arm",
                "move to cue",
                "move to middle",
            )
        )
    )

    # One label table per observation modality, in modality order:
    # location, outcome, cue.
    modality_labels = (
        dict(
            enumerate(
                (
                    "center loc",
                    "left arm loc",
                    "right arm loc",
                    "cue loc",
                    "middle loc",
                )
            )
        ),
        dict(enumerate(("no_outcome", "reward", "punishment"))),
        dict(enumerate(("no cue", "cue-left", "cue-right"))),
    )

    taken = info["action"]
    seen = info["observation"]

    for t in range(taken.shape[1]):
        move_idx = int(taken[0, t, 0])
        action_name = move_labels.get(move_idx, f"action_{move_idx}")

        # Pull this timestep's index out of every modality before any lookup.
        obs_indices = [int(seen[m][0, t, 0]) for m in range(len(modality_labels))]
        location_name, outcome_name, cue_name = (
            table.get(i) for table, i in zip(modality_labels, obs_indices)
        )

        print(f"t={t}: observed=({location_name}, {outcome_name}, {cue_name}) -> action={action_name}")





Copied!







print_agent_behavior(info_vanilla)

print_agent_behavior(info_vanilla)

t=0: observed=(center loc, no_outcome, no cue) -> action=move to cue
t=1: observed=(cue loc, no_outcome, cue-left) -> action=move to center
t=2: observed=(center loc, no_outcome, no cue) -> action=move to middle
t=3: observed=(middle loc, no_outcome, no cue) -> action=move to left arm
t=4: observed=(left arm loc, reward, no cue) -> action=move to center
t=5: observed=(left arm loc, reward, no cue) -> action=move to center





Copied!







print_agent_behavior(info_vanilla)

print_agent_behavior(info_vanilla)

t=0: observed=(center loc, no_outcome, no cue) -> action=move to cue
t=1: observed=(cue loc, no_outcome, cue-left) -> action=move to center
t=2: observed=(center loc, no_outcome, no cue) -> action=move to middle
t=3: observed=(middle loc, no_outcome, no cue) -> action=move to left arm
t=4: observed=(left arm loc, reward, no cue) -> action=move to center
t=5: observed=(left arm loc, reward, no cue) -> action=move to center





Copied!







print_agent_behavior(info_si)

print_agent_behavior(info_si)

t=0: observed=(center loc, no_outcome, no cue) -> action=move to cue
t=1: observed=(cue loc, no_outcome, cue-left) -> action=move to center
t=2: observed=(center loc, no_outcome, no cue) -> action=move to middle
t=3: observed=(middle loc, no_outcome, no cue) -> action=move to left arm
t=4: observed=(left arm loc, reward, no cue) -> action=move to center
t=5: observed=(left arm loc, reward, no cue) -> action=move to center





Copied!







print_agent_behavior(info_si)

print_agent_behavior(info_si)

t=0: observed=(center loc, no_outcome, no cue) -> action=move to cue
t=1: observed=(cue loc, no_outcome, cue-left) -> action=move to center
t=2: observed=(center loc, no_outcome, no cue) -> action=move to middle
t=3: observed=(middle loc, no_outcome, no cue) -> action=move to left arm
t=4: observed=(left arm loc, reward, no cue) -> action=move to center
t=5: observed=(left arm loc, reward, no cue) -> action=move to center





Copied!







print_qpi(agent_vanilla, info_vanilla, print_t1=True)

print_qpi(agent_vanilla, info_vanilla, print_t1=True)

Timestep 0:
  move to center: 0.142
  move to left arm: 0.142
  move to right arm: 0.142
  move to cue: 0.539
  move to middle: 0.036





Copied!







print_qpi(agent_vanilla, info_vanilla, print_t1=True)

print_qpi(agent_vanilla, info_vanilla, print_t1=True)

Timestep 0:
  move to center: 0.142
  move to left arm: 0.142
  move to right arm: 0.142
  move to cue: 0.539
  move to middle: 0.036





Copied!







print_qpi(agent_si, info_si, print_t1=True)

print_qpi(agent_si, info_si, print_t1=True)

Timestep 0:
  move to center: 0.001
  move to left arm: 0.001
  move to right arm: 0.001
  move to cue: 0.810
  move to middle: 0.186





Copied!







print_qpi(agent_si, info_si, print_t1=True)

print_qpi(agent_si, info_si, print_t1=True)

Timestep 0:
  move to center: 0.001
  move to left arm: 0.001
  move to right arm: 0.001
  move to cue: 0.810
  move to middle: 0.186

Validating Sophisticated Inference (SI) Planning Algorithm using the T-Maze Task¶

Overview¶

The T-Maze Task¶

Notebook Structure¶

Setting up the T-Maze environment (Generative Process)¶

States and Observations¶

Environment Parameters¶

Setting up the Agents¶

Running the active inference rollouts¶

Result analysis¶

Doing the same experiment but with the extended `TMaze` environment used in other demos¶

Setting up the T-Maze environment (Generative Process)¶

States and Observations¶

Environment Parameters¶

Setting up the Agents¶

Running the active inference rollouts¶

Result analysis¶

Validating Sophisticated Inference (SI) Planning Algorithm using the T-Maze Task¶

Overview¶

The T-Maze Task¶

Notebook Structure¶

Setting up the T-Maze environment (Generative Process)¶

States and Observations¶

Environment Parameters¶

Setting up the Agents¶

Running the active inference rollouts¶

Result analysis¶

Doing the same experiment but with the extended TMaze environment used in other demos¶

Setting up the T-Maze environment (Generative Process)¶

States and Observations¶

Environment Parameters¶

Setting up the Agents¶

Running the active inference rollouts¶

Result analysis¶

Doing the same experiment but with the extended `TMaze` environment used in other demos¶