How to use the ne_strategy method in pandera

Best Python code snippets using pandera_python

nash_dqn_speed.py

Source: nash_dqn_speed.py (GitHub)


import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import operator
import random, copy
import pickle
from ..common.nn_components import cReLU, Flatten
from ..common.storage import ReplayBuffer
from ..common.rl_utils import choose_optimizer, EpsilonScheduler
from ..common.networks import NetBase, get_model
from .dqn import DQN, DQNBase
from mars.equilibrium_solver import NashEquilibriumECOSSolver

DEBUG = False

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions
    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
        Discrete probability distributions.
    """
    p = np.asarray(p, dtype=np.float)
    q = np.asarray(q, dtype=np.float)
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

class Debugger():
    def __init__(self, env, log_path=None):
        self.env = env
        if env.OneHotObs:
            self.num_states_per_step = int(self.env.observation_space.shape[0])
        else:
            self.num_states_per_step = int(self.env.observation_space.high[0]/(self.env.max_transition+1))
        self.max_transition = env.max_transition
        self.kl_dist_list = [[] for _ in range(self.max_transition)]
        self.mse_v_list = [[] for _ in range(self.max_transition)]
        self.mse_exp_list = [[] for _ in range(self.max_transition)]
        self.cnt = 0
        self.save_interval = 10
        self.logging = {'num_states_per_step': self.num_states_per_step,
                        'max_transition': self.max_transition,
                        'cnt': [],
                        'state_visit': {},
                        'kl_nash_dist': [],
                        'mse_nash_v': [],
                        'mse_exploitability': []
                        }
        self.log_path = log_path
        self.state_list = []
        self.oracle_nash_strategies = np.vstack(self.env.Nash_strategies)  # flatten to shape dim 1
        self.oracle_nash_values = np.concatenate(self.env.Nash_v)  # flatten to shape dim 1
        self.oracle_nash_q_values = np.concatenate(self.env.Nash_q)  # flatten to shape dim 1

    def compare_with_oracle(self, state, dists, ne_vs, verbose=False):
        """[summary]
        :param state: current state
        :type state: [type]
        :param dists: predicted Nash strategies (distributions)
        :type dists: [type]
        :param ne_vs: predicted Nash equilibrium values based on predicted Nash strategies
        :type ne_vs: [type]
        :param verbose: [description], defaults to False
        :type verbose: bool, optional
        """
        self.cnt += 1
        if self.env.OneHotObs:
            state_ = state[0].cpu().numpy()
            id_state = np.where(state_>0)[0][0]
        else:
            id_state = int(torch.sum(state).cpu().numpy()/2)
        for j in range(self.max_transition):  # nash value for non-terminal states (before the final timestep)
            if id_state >= j*self.num_states_per_step and id_state < (j+1)*self.num_states_per_step:  # determine which timestep is current state
                ne_strategy = self.oracle_nash_strategies[id_state]
                ne_v = self.oracle_nash_values[id_state]
                ne_q = self.oracle_nash_q_values[id_state]
                oracle_first_player_ne_strategy = ne_strategy[0]
                nash_dqn_first_player_ne_strategy = dists[0][0]
                br_v = np.min(nash_dqn_first_player_ne_strategy@ne_q)  # best response value (value against best response), reflects exploitability of learned Nash
                kl_dist = kl(oracle_first_player_ne_strategy, nash_dqn_first_player_ne_strategy)
                self.kl_dist_list[j].append(kl_dist)
                mse_v = float((ne_v - ne_vs)**2)  # squared error of Nash values (predicted and oracle)
                self.mse_v_list[j].append(mse_v)
                mse_exp = float((ne_v - br_v)**2)  # the target value of best response value (exploitability) should be the Nash value
                self.mse_exp_list[j].append(mse_exp)
        self.state_visit(id_state)
        self.log([id_state, kl_dist, ne_vs], verbose)
        if self.cnt % self.save_interval == 0:
            self.dump_log()

    def state_visit(self, state):
        self.state_list.append(state)

    def log(self, data, verbose=False):
        # get state visitation statistics
        unique, counts = np.unique(self.state_list, return_counts=True)
        state_stat = dict(zip(unique, counts))
        if verbose:
            print('state index: {}, KL: {}'.format(*data))
            print('state visitation counts: {}'.format(state_stat))
        self.logging['cnt'].append(self.cnt)
        self.logging['state_visit'] = state_stat
        self.logging['kl_nash_dist'] = self.kl_dist_list
        self.logging['mse_nash_v'] = self.mse_v_list
        self.logging['mse_exploitability'] = self.mse_exp_list

    def dump_log(self,):
        with open(self.log_path, "wb") as f:
            pickle.dump(self.logging, f)

class NashDQNSpeed(DQN):
    """
    Nash-DQN algorithm
    """
    def __init__(self, env, args):
        super().__init__(env, args)
        self.num_envs = args.num_envs
        self.model = NashDQNBase(env, args.net_architecture, args.num_envs, two_side_obs=args.marl_spec['global_state']).to(self.device)
        self.target = copy.deepcopy(self.model).to(self.device)

        if args.num_process > 1:
            self.model.share_memory()
            self.target.share_memory()
        self.num_agents = env.num_agents[0] if isinstance(env.num_agents, list) else env.num_agents
        try:
            self.action_dims = env.action_space[0].n
        except:
            self.action_dims = env.action_space.n
        # don't forget to instantiate an optimizer although there is one in DQN
        self.optimizer = choose_optimizer(args.optimizer)(self.model.parameters(), lr=float(args.learning_rate))
        # lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=50, gamma=0.95)
        # self.schedulers.append(lr_scheduler)
        if DEBUG:
            self.debugger = Debugger(env, "./data/nash_dqn_simple_mdp_log.pkl")
        self.warm_up = 500*2000  # ~5000 episodes, b.c. 0.1 update freq, ~2000 episode length; warm-up steps use non-Nash update manner

    def choose_action(self, state, Greedy=False, epsilon=None):
        if Greedy:
            epsilon = 0.
        elif epsilon is None:
            epsilon = self.epsilon_scheduler.get_epsilon()
        if not isinstance(state, torch.Tensor):
            state = torch.Tensor(state).to(self.device)
        if self.num_envs == 1:  # state: (agents, state_dim)
            state = state.unsqueeze(0).view(1, -1)  # change state from (agents, state_dim) to (1, agents*state_dim)
        else:  # state: (agents, envs, state_dim)
            state = torch.transpose(state, 0, 1)  # to state: (envs, agents, state_dim)
            state = state.view(state.shape[0], -1)  # to state: (envs, agents*state_dim)
        if random.random() > epsilon:  # NoisyNet does not use e-greedy
            with torch.no_grad():
                q_values = self.model(state).detach().cpu().numpy()  # needs state: (batch, agents*state_dim)

            if self.update_cnt < self.warm_up:
                q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
                actions = []
                for qt in q_tables:
                    row_q = np.average(qt, axis=-1)
                    col_q = np.average(qt.T, axis=-1)
                    actions.append([np.argmax(row_q), np.argmin(col_q)])
            else:
                # if self.args.cce:
                #     actions = self.compute_cce(q_values)
                # else:
                actions, dists, ne_vs = self.compute_nash(q_values)
                if DEBUG:  ## test on arbitrary MDP
                    self.debugger.compare_with_oracle(state, dists, ne_vs, verbose=True)
        else:
            actions = np.random.randint(self.action_dims, size=(state.shape[0], self.num_agents))  # (envs, agents)

        if self.num_envs == 1:
            actions = actions[0]  # list of actions to its item
        else:
            actions = np.array(actions).T  # to shape: (agents, envs, action_dim)
        return actions

    def compute_nash(self, q_values, return_dist_only=False):
        """
        Return actions as Nash equilibrium of given payoff matrix, shape: [env, agent]
        """
        q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
        all_actions = []
        all_dists = []
        all_ne_values = []
        for qs in q_tables:  # iterate over envs
            # Solve Nash equilibrium with solver
            try:
                # ne = NashEquilibriaSolver(qs)
                # ne = ne[0]  # take the first Nash equilibria found
                # print(np.linalg.det(qs))
                # ne = NashEquilibriumSolver(qs)
                # ne = NashEquilibriumLPSolver(qs)
                # ne = NashEquilibriumCVXPYSolver(qs)
                # ne = NashEquilibriumGUROBISolver(qs)
                ne, ne_v = NashEquilibriumECOSSolver(qs)
            except:  # some cases NE cannot be solved
                print('No Nash solution for: ', np.linalg.det(qs), qs)
                ne = self.num_agents*[1./qs.shape[0]*np.ones(qs.shape[0])]  # use uniform distribution if no NE is found
                ne_v = 0
            all_dists.append(ne)
            all_ne_values.append(ne_v)
            # Sample actions from Nash strategies
            actions = []
            for dist in ne:  # iterate over agents
                try:
                    sample_hist = np.random.multinomial(1, dist)  # return one-hot vectors as sample from multinomial
                except:
                    print('Not a valid distribution from Nash equilibrium solution.')
                    print(sum(ne[0]), sum(ne[1]))
                    print(qs, ne)
                    print(dist)
                a = np.where(sample_hist>0)
                actions.append(a)
            all_actions.append(np.array(actions).reshape(-1))
        if return_dist_only:
            return all_dists
        else:  # return sampled actions, nash strategies, nash values
            return np.array(all_actions), all_dists, all_ne_values

    def compute_cce(self, q_values, return_dist=False):
        """
        Return actions as coarse correlated equilibrium of given payoff matrix, shape: [env, agent]
        """
        q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
        all_actions = []
        all_dists = []
        for qs in q_tables:  # iterate over envs
            try:
                _, _, jnt_probs = CoarseCorrelatedEquilibriumLPSolver(qs)
            except:  # some cases NE cannot be solved
                print('No CCE solution for: ', np.linalg.det(qs), qs)
                jnt_probs = 1./(qs.shape[0]*qs.shape[1])*np.ones(qs.shape[0]*qs.shape[1])  # use uniform distribution if no NE is found

            try:
                sample_hist = np.random.multinomial(1, jnt_probs)  # a joint probability matrix for all players
            except:
                print('Not a valid distribution from Nash equilibrium solution.')
                print(sum(jnt_probs), sum(abs(jnt_probs)))
                print(qs, jnt_probs)
            sample_hist = sample_hist.reshape(self.action_dims, self.action_dims)
            a = np.where(sample_hist>0)  # the actions for two players
            all_actions.append(np.array(a).reshape(-1))
            all_dists.append(jnt_probs)
        if return_dist:
            return all_dists
        else:
            return np.array(all_actions)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(self.batch_size)
        state = torch.FloatTensor(np.float32(state)).to(self.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        done = torch.FloatTensor(np.float32(done)).to(self.device)
        # Q-Learning with target network
        q_values = self.model(state)
        target_next_q_values_ = self.target(next_state)
        target_next_q_values = target_next_q_values_.detach().cpu().numpy()
        action_dim = int(np.sqrt(q_values.shape[-1]))  # for two-symmetric-agent case only
        action_ = torch.LongTensor([a[0]*action_dim+a[1] for a in action]).to(self.device)
        q_value = q_values.gather(1, action_.unsqueeze(1)).squeeze(1)
        # compute CCE or NE
        # if args.cce:  # Coarse Correlated Equilibrium
        #     cce_dists = self.compute_cce(target_next_q_values, return_dist=True)
        #     target_next_q_values_ = target_next_q_values_.reshape(-1, action_dim, action_dim)
        #     cce_dists_ = torch.FloatTensor(cce_dists).to(self.device)
        #     next_q_value = torch.einsum('bij,bij->b', cce_dists_, target_next_q_values_)
        # else:  # Nash Equilibrium
        if self.update_cnt < self.warm_up:
            expected_q_value = reward
        else:
            nash_dists = self.compute_nash(target_next_q_values, return_dist_only=True)  # get the mixed strategy Nash rather than specific actions
            target_next_q_values_ = target_next_q_values_.reshape(-1, action_dim, action_dim)
            nash_dists_ = torch.FloatTensor(nash_dists).to(self.device)
            next_q_value = torch.einsum('bk,bk->b', torch.einsum('bj,bjk->bk', nash_dists_[:, 0], target_next_q_values_), nash_dists_[:, 1])

            expected_q_value = reward + (self.gamma ** self.multi_step) * next_q_value * (1 - done)
        # Huber Loss
        # loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        loss = F.mse_loss(q_value, expected_q_value.detach())
        loss = loss.mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.update_cnt % self.target_update_interval == 0:
            self.update_target(self.model, self.target)
            # self.update_cnt = 0
        self.update_cnt += 1
        return loss.item()

class NashDQNBase(DQNBase):
    """
    Nash-DQN for parallel env sampling
    parameters
    ---------
    env    environment (openai gym)
    """
    def __init__(self, env, net_args, number_envs=2, two_side_obs=True):
        super().__init__(env, net_args)
        self.number_envs = number_envs
        try:
            if two_side_obs:
                self._observation_shape = tuple(map(operator.add, env.observation_space.shape, env.observation_space.shape))  # double the shape
            else:
                self._observation_shape = env.observation_space.shape
            self._action_shape = (env.action_space.n)**2
        except:
            if two_side_obs:
                self._observation_shape = tuple(map(operator.add, env.observation_space[0].shape, env.observation_space[0].shape))  # double the shape
            else:
                self._observation_shape = env.observation_space[0].shape
            self._action_shape = (env.action_space[0].n)**2
        self._construct_net(env, net_args)

    def _construct_net(self, env, net_args):
        input_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=self._observation_shape)
        output_space = gym.spaces.Discrete(self._action_shape)
        if len(self._observation_shape) <= 1:  # not image
            self.net = get_model('mlp')(input_space, output_space, net_args, model_for='discrete_q')
        else:
            ...
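compute_nash above treats NashEquilibriumECOSSolver(qs) as a black box that returns a pair of mixed strategies and a game value for each per-env Q-table. That solver is part of the MARS project; purely as an illustration of that contract (the helper name zero_sum_nash and the use of scipy.optimize.linprog are assumptions, not the project's code), an equivalent two-player zero-sum solve can be sketched like this:

# A minimal stand-in for what the snippet expects from its equilibrium solver:
# a pair of mixed strategies and the game value for a zero-sum payoff matrix.
# This is an illustrative sketch using scipy's LP solver, not the MARS implementation.
import numpy as np
from scipy.optimize import linprog

def zero_sum_nash(Q):
    """Q[i, j]: payoff to the row (max) player; the column player minimizes."""
    n, m = Q.shape

    # Row player: maximize v s.t. (x @ Q)_j >= v for all columns j, x on the simplex.
    c = np.concatenate([np.zeros(n), [-1.0]])                 # minimize -v
    A_ub = np.hstack([-Q.T, np.ones((m, 1))])                 # v - (x @ Q)_j <= 0
    A_eq = np.concatenate([np.ones(n), [0.0]]).reshape(1, -1)  # sum(x) == 1
    res_x = linprog(c, A_ub=A_ub, b_ub=np.zeros(m), A_eq=A_eq, b_eq=[1.0],
                    bounds=[(0, None)] * n + [(None, None)])
    x, v = res_x.x[:n], -res_x.fun

    # Column player: minimize u s.t. (Q @ y)_i <= u for all rows i, y on the simplex.
    c2 = np.concatenate([np.zeros(m), [1.0]])                  # minimize u
    A_ub2 = np.hstack([Q, -np.ones((n, 1))])                   # (Q @ y)_i - u <= 0
    A_eq2 = np.concatenate([np.ones(m), [0.0]]).reshape(1, -1)  # sum(y) == 1
    res_y = linprog(c2, A_ub=A_ub2, b_ub=np.zeros(n), A_eq=A_eq2, b_eq=[1.0],
                    bounds=[(0, None)] * m + [(None, None)])
    y = res_y.x[:m]

    return (x, y), v

# Matching pennies: both strategies come out uniform and the game value is 0,
# which is the shape of output compute_nash samples its actions from.
ne, ne_v = zero_sum_nash(np.array([[1., -1.], [-1., 1.]]))
print(ne, ne_v)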


blockwithholding.py

Source: blockwithholding.py (GitHub)


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
import argparse
from gym import spaces
import ray
from ray.tune.registry import register_env
from ray.rllib.models.preprocessors import get_preprocessor
from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf
from ray.tune.util import flatten_dict
from ray.tune.result import (NODE_IP, TRAINING_ITERATION, TIME_TOTAL_S,
                             TIMESTEPS_TOTAL, EXPR_PARAM_FILE,
                             EXPR_PARAM_PICKLE_FILE, EXPR_PROGRESS_FILE,
                             EXPR_RESULT_FILE)
from ray.tune.logger import pretty_print
import os
import csv
import mdptoolbox
import pandas as pd
import math
import sympy as sym
import matplotlib.pyplot as plt  # needed by plot_Nash_equilibrium

tf = try_import_tf()

CLI = argparse.ArgumentParser()
CLI.add_argument(
    "--alphas",
    nargs='*',
    type=float,
    default=[.4, .5]
)
CLI.add_argument(
    "--impact",
    type=float,
    default=.2
)
CLI.add_argument(
    "--threshold",
    type=float,
    default=.02
)
CLI.add_argument(
    '--algo',
    type=str,
    default='PPO'
)
CLI.add_argument(
    '--use_lstm',
    type=bool,
    default=False
)
CLI.add_argument(
    '--gamma',
    type=float,
    default=0.99
)
CLI.add_argument(
    '--lr',
    type=float,
    default=1e-6
)
CLI.add_argument(
    '--lmbda',
    type=float,
    default=1.0
)
CLI.add_argument(
    '--iteration',
    type=int,
    default=10
)
CLI.add_argument(
    '--episodes',
    type=int,
    default=1e6
)
CLI.add_argument(
    '--ep_length',
    type=int,
    default=1
)
CLI.add_argument(
    '--gpus',
    type=int,
    default=0
)
CLI.add_argument(
    '--NE',
    type=bool,
    default=False
)
CLI.add_argument(
    '--workers',
    type=int,
    default=5
)
CLI.add_argument(
    '--evaluate',
    type=bool,
    default=False
)
CLI.add_argument(
    '--eval_ep',
    type=int,
    default=1
)
args = CLI.parse_args()

eps = 1e-6
# setting in miner's dilemma
ACTION_SPACE = spaces.Box(low=np.array([0.]), high=np.array([1.]), dtype=np.float32)
STATE_SPACE = spaces.Discrete(1)
NE = dict()

def get_optimal_strategy(a, b, y):
    x = sym.Symbol('x', real=True)
    R1 = (a - x) / (1. - x - y)
    R2 = (b - y) / (1. - x - y)
    r1 = ((b * R1) + x * (R1 + R2)) / (a * b + a * x + b * y)
    d1 = sym.Eq(sym.diff(r1, x), 0.)
    A = sym.solve(d1, x)

    if A:
        for i in A:
            if (i > eps and i < a - eps):
                return i, r1.subs(x, i)
    if (a * b + b * y < eps or r1.subs(x, 0.) > r1.subs(x, a) - eps):
        return 0., r1.subs(x, 0.)
    else:
        return a, r1.subs(x, a)

def plot_Nash_equilibrium(x, y, z, name):
    x, y = np.meshgrid(x, y)
    z = z.transpose()
    intensity = z.reshape(len(y), len(x))

    plt.title(name)
    plt.pcolormesh(x, y, intensity, rasterized=True)
    plt.clim(0., 1.2)
    plt.colorbar()  # need a colorbar to show the intensity scale
    # plt.show()  # boom

def compute_reward(a, b, x, y):
    if (x + y > 1 - eps):
        return {'0': 0., '1': 0.}
    if (y < eps and a < eps):
        return {'0': 1., '1': 1.}
    if (x < eps and b < eps):
        return {'0': 1., '1': 1.}
    R1 = (a - x) / (1. - x - y)
    R2 = (b - y) / (1. - x - y)
    r1 = ((b * R1) + x * (R1 + R2)) / (a * b + a * x + b * y)
    r2 = ((a * R2) + y * (R1 + R2)) / (a * b + a * x + b * y)
    return {'0': r1, '1': r2}

def get_Nash_equilibrium(alphas):
    a = alphas[0]
    b = alphas[1]
    if (a + b > 1. or (a < eps and b < eps)):
        return 0., 0., 1., 1.
    x = 0.
    y = 0.
    while (True):
        X, R1 = get_optimal_strategy(a, b, y)
        Y, R2 = get_optimal_strategy(b, a, x)

        if (abs(X - x) < eps and abs(Y - y) < eps):
            rev = compute_reward(a, b, x, y)
            return x, y, rev['0'], rev['1']

        x = X
        y = Y

class MigrationEnv(MultiAgentEnv):
    def __init__(self, env_config):
        self.action_space = ACTION_SPACE
        self.observation_space = STATE_SPACE
        self.HASHRATE = np.array(env_config['alphas'])
        self.alphas = np.array(env_config['alphas'])
        self.N = len(self.alphas)
        self.episode_length = env_config['ep_length']
        self.attr = np.full((self.N), 1.)
        self.impact = args.impact
        self.threshold = args.threshold
        self.largest_pool = np.full((self.N, 2), -1)
        self.num_moves = 0

    def compute_states(self):
        obs_state = dict()
        self.largest_pool = np.full((self.N, 2), -1)
        for i in range(len(self.alphas)):
            tmp = np.array([self.alphas[i], 0., 0., 0.])
            rest = []

            for j in range(len(self.alphas)):
                if i == j:
                    continue
                if self.alphas[j] >= tmp[1]:
                    if (self.largest_pool[i][1] > -1):
                        rest.append(tmp[2])
                    tmp[2] = tmp[1]
                    self.largest_pool[i][1] = self.largest_pool[i][0]
                    tmp[1] = self.alphas[j]
                    self.largest_pool[i][0] = j
                elif self.alphas[j] > tmp[2]:
                    if (self.largest_pool[i][1] > -1):
                        rest.append(tmp[2])
                    tmp[2] = self.alphas[j]
                    self.largest_pool[i][1] = j
                else:
                    rest.append(self.alphas[j])
            tmp[3] = np.array(rest).std()
            obs_state[str(i)] = tmp
        return obs_state

    # reset the environment to the starting state
    def reset(self):
        self.num_moves = 0
        self.alphas = np.array(self.HASHRATE)
        self.attr = np.full((self.N), 1.)
        return self.compute_states()

    def construct_action(self, action_dict):
        action = np.empty([self.N, self.N], dtype=np.float32)
        for i in range(self.N):
            action[i] = np.full((self.N), self.alphas[i] * action_dict[str(i)][2])
            action[i][i] = 0.
            if self.largest_pool[i][0] > -1:
                action[i][self.largest_pool[i][0]] = self.alphas[i] * action_dict[str(i)][0]
            if self.largest_pool[i][1] > -1:
                action[i][self.largest_pool[i][1]] = self.alphas[i] * action_dict[str(i)][1]
            if (action[i].sum() > 1 - eps):
                action[i] = action[i] / (action[i] + eps)

        return action

    def step(self, action_dict):
        self.num_moves += 1
        a = np.empty([self.N, self.N], dtype=np.float32)
        b = np.empty([self.N], dtype=np.float32)
        action = self.construct_action(action_dict)
        # print("states:{}\n{}\n{}\n".format(self.compute_states(), action_dict, action))
        infiltrate = action.sum(1)
        infiltrated = action.sum(0)
        total = action.sum()
        for i in range(self.N):
            for j in range(self.N):
                if i == j:
                    a[i][j] = self.alphas[i] + infiltrated[i]
                else:
                    a[i][j] = -action[i][j]
            b[i] = (self.alphas[i] - infiltrate[i]) / (1 - total)
        r = np.empty([self.N], dtype=np.float32)
        try:
            r = np.linalg.solve(a, b)
        except(RuntimeError, np.linalg.LinAlgError):
            r = np.full((self.N), 1.)
        R = dict()
        for i in range(self.N):
            R[str(i)] = r[i]
        done = {"__all__": self.num_moves >= self.episode_length}
        for i in range(self.N):
            self.attr[i] = max(0., min(1., self.attr[i] + self.impact * (r[i] - 1.)))
        tmp_alphas = np.array(self.alphas)
        for i in range(self.N):
            sumn = tmp_alphas[i] * max(0., 1. - self.attr[i] - self.threshold)
            self.alphas[i] -= sumn
            mean = np.array(self.attr) / self.attr.sum()
            cov = np.diag(mean) - np.dot(np.transpose([mean]), [mean])
            mig = np.random.multivariate_normal(sumn * mean, sumn * cov)
            for j in range(self.N):
                # self.alphas[i] += tmp_alphas[j] * max(0, 1 - self.attr[j] - self.threshold) * self.attr[i] / self.attr.sum()
                self.alphas[j] += mig[j]
        assert(abs(self.alphas.sum() - 1.) < eps)

        alphas = dict()
        for i in range(self.N):
            alphas[str(i)] = self.alphas[i] - tmp_alphas[i]

        info = dict()
        for i in range(self.N):
            info[str(i)] = {'policy': np.array(action[i]), 'reward': r[i], 'alphas': self.alphas[i]}
        return self.compute_states(), alphas, done, info

class BlockWithholdingEnv(MultiAgentEnv):
    def __init__(self, env_config):
        self.action_space = ACTION_SPACE
        self.observation_space = STATE_SPACE
        self.alphas = env_config['alphas']
        self.N = len(self.alphas)
        self.honest_power = 1 - sum(self.alphas)
        self.episode_length = env_config['ep_length']
        self.num_moves = 0

    # reset the environment to the starting state
    def reset(self):
        self.num_moves = 0
        return {
            '0': 0,
            '1': 0
        }

    def step(self, action_dict):
        self.num_moves += 1
        a = self.alphas[0]
        b = self.alphas[1]
        x = action_dict['0'][0]
        y = action_dict['1'][0]
        done = {"__all__": self.num_moves >= self.episode_length}
        R = compute_reward(a, b, x * a, y * b)
        info = dict()
        info['0'] = {'policy': x * a, 'reward': R['0']}
        info['1'] = {'policy': y * b, 'reward': R['1']}
        return {'0': 0, '1': 0}, R, done, info

class Constant(Policy):
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.infiltrating = config['infiltrating']

    def compute_actions(self,
                        obs_batch,
                        state_batches,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        actions = []
        for i in range(len(obs_batch)):
            actions.append([self.infiltrating])
        return actions, [], {}

    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

class NE_strategy(Policy):
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        x, y, r1, r2 = get_Nash_equilibrium(config['alphas'])
        self.infiltrating = y / config['alphas'][1]

    def compute_actions(self,
                        obs_batch,
                        state_batches,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        actions = []
        for i in range(len(obs_batch)):
            actions.append([self.infiltrating])
        return actions, [], {}

    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

def on_episode_start(info):
    episode = info["episode"]

def on_episode_step(info):
    episode = info["episode"]
    episode.user_data['0'] = episode.last_info_for('0')
    episode.user_data['1'] = episode.last_info_for('1')

def on_episode_end(info):
    episode = info["episode"]
    print(episode.user_data)

def run_RL(policies_to_train, policies):
    def select_policy(agent_id):
        return agent_id

    tune.run(
        args.algo,
        stop={"episodes_total": args.episodes},
        config={
            "num_gpus": args.gpus,
            "env": BlockWithholdingEnv,
            "entropy_coeff": 0.01,
            "entropy_coeff_schedule": args.episodes * 1000,
            "clip_param": 0.1,
            "gamma": args.gamma,
            "lambda": args.lmbda,
            "lr_schedule": [[0, 1e-5], [args.episodes, 1e-7]],
            "num_workers": args.workers,
            "num_envs_per_worker": 1,
            "sample_batch_size": 10,
            "train_batch_size": 128,
            "multiagent": {
                "policies_to_train": policies_to_train,
                "policies": policies,
                "policy_mapping_fn": select_policy,
            },
            "env_config": {
                "alphas": args.alphas,
                'ep_length': args.ep_length
            },
            "monitor": True,
            "callbacks": {
                "on_episode_start": on_episode_start,
                "on_episode_step": on_episode_step,
                "on_episode_end": on_episode_end,
            },
            "ignore_worker_failures": True,
        })

NE['a0'], NE['a1'], NE['r1'], NE['r2'] = get_Nash_equilibrium(args.alphas)
print(args.alphas, NE)
policies_to_train = [str(i) for i in range(len(args.alphas))]
policies = dict()
for i in range(len(args.alphas)):
    policies[str(i)] = (None, STATE_SPACE, ACTION_SPACE, {
        "model": {
            "use_lstm": args.use_lstm
        }
    })
...


debug.py

Source: debug.py (GitHub)


import torch
import numpy as np
import pickle
from mars.equilibrium_solver import NashEquilibriumECOSSolver, NashEquilibriumMWUSolver, NashEquilibriumParallelMWUSolver

DEBUG = False

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions
    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
        Discrete probability distributions.
    """
    p = np.asarray(p, dtype=np.float)
    q = np.asarray(q, dtype=np.float)
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

def to_one_hot(s, range):
    one_hot_vec = np.zeros(range)
    one_hot_vec[s] = 1
    return one_hot_vec

class Debugger():
    def __init__(self, env, log_path=None):
        self.env = env
        if env.OneHotObs:
            self.num_states_per_step = int(self.env.observation_space.shape[0])
        else:
            self.num_states_per_step = int(self.env.observation_space.high[0]/(self.env.max_transition+1))
        self.max_transition = env.max_transition
        self.kl_dist_list = [[] for _ in range(self.max_transition)]
        self.mse_v_list = [[] for _ in range(self.max_transition)]
        self.mse_exp_list = [[] for _ in range(self.max_transition)]
        self.brv_list = []
        self.cnt = 0
        self.save_interval = 10
        self.logging = {'num_states_per_step': self.num_states_per_step,
                        'max_transition': self.max_transition,
                        'oracle_exploitability': np.mean(self.env.Nash_v[0], axis=0),  # the average nash value for initial states from max-player's view
                        'cnt': [],
                        'state_visit': {},
                        'kl_nash_dist': [],
                        'mse_nash_v': [],
                        'mse_exploitability': []
                        }
        self.log_path = log_path
        self.state_list = []
        self.oracle_nash_strategies = np.vstack(self.env.Nash_strategies)  # flatten to shape dim 1
        self.oracle_nash_values = np.concatenate(self.env.Nash_v)  # flatten to shape dim 1
        self.oracle_nash_q_values = np.concatenate(self.env.Nash_q)  # flatten to shape dim 1
        self.trans_prob_matrices = self.env.env.trans_prob_matrices
        self.reward_matrices = self.env.env.reward_matrices
        print('oracle nash v star: ', np.mean(self.env.Nash_v[0], axis=0))  # the average nash value for initial states from max-player's view

    def best_response_value(self, learned_q):
        """
        Formulas for calculating best response values:
        1. Nash strategies: (\pi_a^*, \pi_b^*) = \min \max Q(s,a,b),
           where Q(s,a,b) = r(s,a,b) + \gamma \min \max Q(s',a',b') (this is the definition of Nash Q-value);
        2. Best response (of max player) value: Br V(s) = \min_b \pi(s,a) Q(s,a,b)
        """
        Br_v = []
        Br_q = []
        Nash_strategies = []
        num_actions = learned_q.shape[-1]
        for tm, rm, qm in zip(self.trans_prob_matrices[::-1], self.reward_matrices[::-1], learned_q[::-1]):  # inverse enumerate
            if len(Br_v) > 0:
                rm = np.array(rm)+np.array(Br_v[-1])  # broadcast sum on rm's last dim, last one in Nash_v is for the next state
            br_q_values = np.einsum("ijk,ijk->ij", tm, rm)  # transition prob * reward for the last dimension in (state, action, next_state)
            br_q_values = br_q_values.reshape(-1, num_actions, num_actions)  # action list to matrix
            Br_q.append(br_q_values)
            br_values = []
            ne_strategies = []
            for q, br_q in zip(qm, br_q_values):
                ne, _ = NashEquilibriumECOSSolver(q)
                ne_strategies.append(ne)
                br_value = np.min(ne[0]@br_q)  # best response against "Nash" strategy of first player
                br_values.append(br_value)  # each value is a Nash equilibrium value on one state
            Br_v.append(br_values)  # (trans, state)
            Nash_strategies.append(ne_strategies)
        Br_v = Br_v[::-1]  # (#trans, #states)
        Br_q = Br_q[::-1]
        Nash_strategies = Nash_strategies[::-1]
        avg_init_br_v = -np.mean(Br_v[0])  # average best response value of initial states; minus for making it positive
        return avg_init_br_v

    def compare_with_oracle(self, state, dists, ne_vs, ne_q_vs, verbose=False):
        """[summary]
        :param state: current state
        :type state: [type]
        :param dists: predicted Nash strategies (distributions)
        :type dists: [type]
        :param ne_vs: predicted Nash equilibrium values based on predicted Nash strategies
        :type ne_vs: [type]
        :param verbose: [description], defaults to False
        :type verbose: bool, optional
        """
        self.cnt += 1
        if self.env.OneHotObs:
            state_ = state[0].cpu().numpy()
            id_state = np.where(state_>0)[0][0]
        else:
            id_state = int(torch.sum(state).cpu().numpy()/2)
        for j in range(self.max_transition):  # nash value for non-terminal states (before the final timestep)
            if id_state >= j*self.num_states_per_step and id_state < (j+1)*self.num_states_per_step:  # determine which timestep is current state
                ne_strategy = self.oracle_nash_strategies[id_state]
                ne_v = self.oracle_nash_values[id_state]
                ne_q = self.oracle_nash_q_values[id_state]
                oracle_first_player_ne_strategy = ne_strategy[0]
                nash_dqn_first_player_ne_strategy = dists[0][0]
                br_v = np.min(nash_dqn_first_player_ne_strategy@ne_q)  # best response value (value against best response), reflects exploitability of learned Nash; but this minimization is taken with oracle nash
                kl_dist = kl(oracle_first_player_ne_strategy, nash_dqn_first_player_ne_strategy)
                self.kl_dist_list[j].append(kl_dist)
                mse_v = float((ne_v - ne_vs)**2)  # squared error of Nash values (predicted and oracle)
                self.mse_v_list[j].append(mse_v)
                ### this is the exploitability/regret for each state; but not calculated correctly, the minimization should take over best-response Q value rather than nash Q (neither oracle nor learned)
                mse_exp = float((ne_v - br_v)**2)  # the target value of best response value (exploitability) should be the Nash value
                self.mse_exp_list[j].append(mse_exp)
        ## this is the correct calculation of exploitability: average best-response value of the initial states
        brv = self.best_response_value(ne_q_vs, )
        self.brv_list.append(brv)
        self.state_visit(id_state)
        self.log([id_state, kl_dist, ne_vs], verbose)
        if self.cnt % self.save_interval == 0:
            self.dump_log()

    def state_visit(self, state):
        self.state_list.append(state)

    def log(self, data, verbose=False):
        # get state visitation statistics
        unique, counts = np.unique(self.state_list, return_counts=True)
        state_stat = dict(zip(unique, counts))
        if verbose:
            print('state index: {}, KL: {}'.format(*data))
            print('state visitation counts: {}'.format(state_stat))
        self.logging['cnt'].append(self.cnt)
        self.logging['state_visit'] = state_stat
        self.logging['kl_nash_dist'] = self.kl_dist_list
        self.logging['mse_nash_v'] = self.mse_v_list
        self.logging['mse_exploitability'] = self.mse_exp_list
        self.logging['brv'] = self.brv_list

    def dump_log(self,):
        with open(self.log_path, "wb") as f:
            ...
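The kl helper is what the Debugger uses to score a predicted Nash strategy against the oracle ne_strategy for the visited state. A minimal standalone check (the strategy values below are made up for illustration, and np.float is swapped for the builtin float since the alias was removed in recent NumPy):

import numpy as np

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions."""
    p = np.asarray(p, dtype=float)   # np.float is deprecated/removed in NumPy >= 1.24
    q = np.asarray(q, dtype=float)
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

oracle_ne_strategy = np.array([0.5, 0.5])    # oracle Nash strategy for one state
learned_ne_strategy = np.array([0.6, 0.4])   # strategy predicted by Nash-DQN

print(kl(oracle_ne_strategy, oracle_ne_strategy))   # 0.0 when the strategies match
print(kl(oracle_ne_strategy, learned_ne_strategy))  # ~0.0204, grows with the mismatch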

