How to use save_resume_state method in Slash

Best Python code snippets using slash

Note: the snippets below are drawn from two different codebases. ppo_trainer.py, base_trainer.py, and ddp_utils.py come from Facebook AI's habitat-lab RL codebase, which defines its own save_resume_state helper for preemptable training jobs; resuming.py shows the save_resume_state function inside the Slash testing framework itself.

ppo_trainer.py

Source: ppo_trainer.py (GitHub)


...
if ppo_cfg.use_linear_clip_decay:
    self.agent.clip_param = ppo_cfg.clip_param * (
        1 - self.percent_done()
    )

if rank0_only() and self._should_save_resume_state():
    requeue_stats = dict(
        env_time=self.env_time,
        pth_time=self.pth_time,
        count_checkpoints=count_checkpoints,
        num_steps_done=self.num_steps_done,
        num_updates_done=self.num_updates_done,
        _last_checkpoint_percent=self._last_checkpoint_percent,
        prev_time=(time.time() - self.t_start) + prev_time,
        running_episode_stats=self.running_episode_stats,
        window_episode_stats=dict(self.window_episode_stats),
    )
    save_resume_state(
        dict(
            state_dict=self.agent.state_dict(),
            optim_state=self.agent.optimizer.state_dict(),
            lr_sched_state=lr_scheduler.state_dict(),
            config=self.config,
            requeue_stats=requeue_stats,
        ),
        self.config,
    )

if EXIT.is_set():
    profiling_wrapper.range_pop()  # train update
    self.envs.close()
    requeue_job()
    return
...
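In this trainer, the full training state (model weights, optimizer and LR-scheduler state, plus bookkeeping stats) is bundled into one dict and saved on rank 0 so a preempted SLURM job can be requeued and resume where it left off. Below is a self-contained sketch of that same save/resume pattern using plain PyTorch; the file path and model are illustrative, not habitat-lab API.

import os
import torch
import torch.nn as nn

CKPT_PATH = ".resume_state.pth"  # illustrative path, not habitat-lab's naming

model = nn.Linear(4, 2)
optim = torch.optim.Adam(model.parameters())

# On startup, resume if a state file already exists
# (this mirrors what load_resume_state does in ddp_utils.py below).
start_update = 0
if os.path.exists(CKPT_PATH):
    state = torch.load(CKPT_PATH)
    model.load_state_dict(state["state_dict"])
    optim.load_state_dict(state["optim_state"])
    start_update = state["requeue_stats"]["num_updates_done"]

for update in range(start_update, 10):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optim.zero_grad()
    loss.backward()
    optim.step()
    # Persist everything needed to requeue, mirroring the dict
    # the trainer above passes to save_resume_state.
    torch.save(
        dict(
            state_dict=model.state_dict(),
            optim_state=optim.state_dict(),
            requeue_stats=dict(num_updates_done=update + 1),
        ),
        CKPT_PATH,
    )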


base_trainer.py

Source: base_trainer.py (GitHub)


            ...
            )
            # We save a resume state during evaluation so that
            # we can resume evaluating in case the job gets
            # preempted.
            save_resume_state(
                {
                    "config": self.config,
                    "prev_ckpt_ind": prev_ckpt_ind,
                },
                self.config,
                filename_key="eval",
            )

            if (prev_ckpt_ind + 1) == self.config.NUM_CHECKPOINTS:
                break

    def _eval_checkpoint(
        self,
        checkpoint_path: str,
        writer: TensorboardWriter,
        checkpoint_index: int = 0,
    ) -> None:
        raise NotImplementedError

    def save_checkpoint(self, file_name) -> None:
        raise NotImplementedError

    def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict:
        raise NotImplementedError


class BaseRLTrainer(BaseTrainer):
    r"""Base trainer class for RL trainers. Future RL-specific
    methods should be hosted here.
    """

    device: torch.device  # type: ignore
    config: Config
    video_option: List[str]
    num_updates_done: int
    num_steps_done: int
    _flush_secs: int
    _last_checkpoint_percent: float

    def __init__(self, config: Config) -> None:
        super().__init__()
        assert config is not None, "needs config file to initialize trainer"
        self.config = config
        self._flush_secs = 30
        self.num_updates_done = 0
        self.num_steps_done = 0
        self._last_checkpoint_percent = -1.0

        if config.NUM_UPDATES != -1 and config.TOTAL_NUM_STEPS != -1:
            raise RuntimeError(
                "NUM_UPDATES and TOTAL_NUM_STEPS are both specified. One must be -1.\n"
                " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format(
                    config.NUM_UPDATES, config.TOTAL_NUM_STEPS
                )
            )

        if config.NUM_UPDATES == -1 and config.TOTAL_NUM_STEPS == -1:
            raise RuntimeError(
                "One of NUM_UPDATES and TOTAL_NUM_STEPS must be specified.\n"
                " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format(
                    config.NUM_UPDATES, config.TOTAL_NUM_STEPS
                )
            )

        if config.NUM_CHECKPOINTS != -1 and config.CHECKPOINT_INTERVAL != -1:
            raise RuntimeError(
                "NUM_CHECKPOINTS and CHECKPOINT_INTERVAL are both specified."
                " One must be -1.\n"
                " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format(
                    config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL
                )
            )

        if config.NUM_CHECKPOINTS == -1 and config.CHECKPOINT_INTERVAL == -1:
            raise RuntimeError(
                "One of NUM_CHECKPOINTS and CHECKPOINT_INTERVAL must be specified"
                " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format(
                    config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL
                )
            )

    def percent_done(self) -> float:
        if self.config.NUM_UPDATES != -1:
            return self.num_updates_done / self.config.NUM_UPDATES
        else:
            return self.num_steps_done / self.config.TOTAL_NUM_STEPS

    def is_done(self) -> bool:
        return self.percent_done() >= 1.0

    def should_checkpoint(self) -> bool:
        needs_checkpoint = False
        if self.config.NUM_CHECKPOINTS != -1:
            checkpoint_every = 1 / self.config.NUM_CHECKPOINTS
            if (
                self._last_checkpoint_percent + checkpoint_every
                < self.percent_done()
            ):
                needs_checkpoint = True
                self._last_checkpoint_percent = self.percent_done()
        else:
            needs_checkpoint = (
                self.num_updates_done % self.config.CHECKPOINT_INTERVAL
            ) == 0
        return needs_checkpoint

    def _should_save_resume_state(self) -> bool:
        return SAVE_STATE.is_set() or (
            (
                not self.config.RL.preemption.save_state_batch_only
                or is_slurm_batch_job()
            )
            and (
                (
                    int(self.num_updates_done + 1)
                    % self.config.RL.preemption.save_resume_state_interval
                )
                == 0
            )
        )

    @property
...
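The should_checkpoint() logic above spaces NUM_CHECKPOINTS checkpoints evenly over training progress by remembering the percent complete at the last checkpoint. Here is a standalone toy run of that arithmetic; the values are made up purely for illustration.

NUM_CHECKPOINTS = 4
TOTAL_UPDATES = 20

last_checkpoint_percent = -1.0
checkpoint_every = 1 / NUM_CHECKPOINTS  # 0.25

for updates_done in range(1, TOTAL_UPDATES + 1):
    percent_done = updates_done / TOTAL_UPDATES
    if last_checkpoint_percent + checkpoint_every < percent_done:
        last_checkpoint_percent = percent_done
        print(f"checkpoint at update {updates_done} ({percent_done:.0%} done)")

# Fires at updates 1, 7, 13, 19: once up front, then just past
# each successive 25% progress boundary.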


resuming.py

Source: resuming.py (GitHub)


...
    old_sessions_ids = [session.session_id for session in old_sessions_query.all()]
    if old_sessions_ids:
        conn.query(ResumeState).filter(ResumeState.session_id.in_(old_sessions_ids)).delete(synchronize_session=False)
    old_sessions_query.delete(synchronize_session=False)


def save_resume_state(session_result, collected_tests):
    session_metadata = SessionMetadata(
        session_id=session_result.session.id,
        src_folder=os.path.abspath(os.getcwd()),
        created_at=datetime.now())
    tests_to_resume = []
    for result in session_result.iter_test_results():
        metadata = result.test_metadata
        test_to_resume = ResumeState(session_id=session_result.session.id, file_name=metadata.file_path, address_in_file=metadata.address_in_file)
        test_to_resume.variation = str(metadata.variation.id) if metadata.variation else None
        if result.is_success_finished():
            test_to_resume.status = ResumeTestStatus.SUCCESS
        elif not result.is_started() or result.is_skip():
            test_to_resume.status = ResumeTestStatus.PLANNED
        else:
...
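resuming.py is Slash's own implementation: at the end of a run it records, per test, whether it succeeded, was skipped or never started (planned), or failed, so that a later `slash resume <session-id>` can re-run only the unfinished tests. Below is a hypothetical sketch of invoking it from a session-end hook. The hook-registration style is Slash's documented gossip-based slash.hooks mechanism, but the exact arguments passed to save_resume_state here are an assumption, not documented API.

import slash
from slash.resuming import save_resume_state

@slash.hooks.session_end.register  # documented Slash hook-registration style
def _persist_resume_state():
    session = slash.context.session
    if session is None:
        return
    # ASSUMPTION: session.results exposes what save_resume_state expects
    # (a .session attribute and iter_test_results()); collected_tests is
    # unused in the excerpt above, so an empty list stands in here.
    save_resume_state(session.results, collected_tests=[])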


ddp_utils.py

Source: ddp_utils.py (GitHub)


...
    # SLURM always sends SIGTERM so we can use this to save and exit
    signal.signal(signal.SIGTERM, _clean_exit_and_save_handler)
    signal.signal(signal.SIGUSR1, _requeue_handler)


@rank0_only
def save_resume_state(state: Any, filename_or_config: Union[Config, str]):
    r"""Saves the resume job state to the specified filename.
    This is useful when working with preemptable job partitions.

    :param state: The state to save
    :param filename_or_config: The filename of the saved state or the config to construct it.
    """
    if isinstance(filename_or_config, Config):
        filename = resume_state_filename(filename_or_config)
    else:
        filename = filename_or_config

    torch.save(state, filename)


def load_resume_state(filename_or_config: Union[Config, str]) -> Optional[Any]:
    r"""Loads the saved resume state

    :param filename_or_config: The filename of the saved state or the config to construct it.

    :return: The saved state if the file exists, else none
...
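Since save_resume_state accepts either a Config or a plain filename, a quick round-trip looks like the following. This is a minimal sketch assuming habitat-lab is installed and that these helpers live at their usual habitat_baselines.rl.ddppo.ddp_utils import path, which may differ across versions.

from habitat_baselines.rl.ddppo.ddp_utils import (
    load_resume_state,
    save_resume_state,
)

# Filename branch: the isinstance(…, Config) check fails for a str,
# so the path is used as-is.
save_resume_state({"num_updates_done": 42}, "/tmp/resume_state.pth")

state = load_resume_state("/tmp/resume_state.pth")
if state is not None:  # load_resume_state returns None when no file exists yet
    print(state["num_updates_done"])  # -> 42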


