How to use save_resume_state method in Slash

Best Python code snippets using slash

Note: the snippets below are drawn from two different codebases. ppo_trainer.py, base_trainer.py, and ddp_utils.py come from Facebook AI's habitat-lab RL codebase, which defines its own save_resume_state helper for preemptable training jobs; resuming.py shows the save_resume_state function inside the Slash testing framework itself.

ppo_trainer.py

Source: ppo_trainer.py (GitHub)


...
if ppo_cfg.use_linear_clip_decay:
    self.agent.clip_param = ppo_cfg.clip_param * (
        1 - self.percent_done()
    )

if rank0_only() and self._should_save_resume_state():
    requeue_stats = dict(
        env_time=self.env_time,
        pth_time=self.pth_time,
        count_checkpoints=count_checkpoints,
        num_steps_done=self.num_steps_done,
        num_updates_done=self.num_updates_done,
        _last_checkpoint_percent=self._last_checkpoint_percent,
        prev_time=(time.time() - self.t_start) + prev_time,
        running_episode_stats=self.running_episode_stats,
        window_episode_stats=dict(self.window_episode_stats),
    )
    save_resume_state(
        dict(
            state_dict=self.agent.state_dict(),
            optim_state=self.agent.optimizer.state_dict(),
            lr_sched_state=lr_scheduler.state_dict(),
            config=self.config,
            requeue_stats=requeue_stats,
        ),
        self.config,
    )

if EXIT.is_set():
    profiling_wrapper.range_pop()  # train update
    self.envs.close()
    requeue_job()
    return
...
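In this trainer, the full training state (model weights, optimizer and LR-scheduler state, plus bookkeeping stats) is bundled into one dict and saved on rank 0 so a preempted SLURM job can be requeued and resume where it left off. Below is a self-contained sketch of that same save/resume pattern using plain PyTorch; the file path and model are illustrative, not habitat-lab API.

import os
import torch
import torch.nn as nn

CKPT_PATH = ".resume_state.pth"  # illustrative path, not habitat-lab's naming

model = nn.Linear(4, 2)
optim = torch.optim.Adam(model.parameters())

# On startup, resume if a state file already exists
# (this mirrors what load_resume_state does in ddp_utils.py below).
start_update = 0
if os.path.exists(CKPT_PATH):
    state = torch.load(CKPT_PATH)
    model.load_state_dict(state["state_dict"])
    optim.load_state_dict(state["optim_state"])
    start_update = state["requeue_stats"]["num_updates_done"]

for update in range(start_update, 10):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optim.zero_grad()
    loss.backward()
    optim.step()
    # Persist everything needed to requeue, mirroring the dict
    # the trainer above passes to save_resume_state.
    torch.save(
        dict(
            state_dict=model.state_dict(),
            optim_state=optim.state_dict(),
            requeue_stats=dict(num_updates_done=update + 1),
        ),
        CKPT_PATH,
    )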


base_trainer.py

Source: base_trainer.py (GitHub)


            ...
            )
            # We save a resume state during evaluation so that
            # we can resume evaluating in case the job gets
            # preempted.
            save_resume_state(
                {
                    "config": self.config,
                    "prev_ckpt_ind": prev_ckpt_ind,
                },
                self.config,
                filename_key="eval",
            )

            if (prev_ckpt_ind + 1) == self.config.NUM_CHECKPOINTS:
                break

    def _eval_checkpoint(
        self,
        checkpoint_path: str,
        writer: TensorboardWriter,
        checkpoint_index: int = 0,
    ) -> None:
        raise NotImplementedError

    def save_checkpoint(self, file_name) -> None:
        raise NotImplementedError

    def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict:
        raise NotImplementedError


class BaseRLTrainer(BaseTrainer):
    r"""Base trainer class for RL trainers. Future RL-specific
    methods should be hosted here.
    """

    device: torch.device  # type: ignore
    config: Config
    video_option: List[str]
    num_updates_done: int
    num_steps_done: int
    _flush_secs: int
    _last_checkpoint_percent: float

    def __init__(self, config: Config) -> None:
        super().__init__()
        assert config is not None, "needs config file to initialize trainer"
        self.config = config
        self._flush_secs = 30
        self.num_updates_done = 0
        self.num_steps_done = 0
        self._last_checkpoint_percent = -1.0

        if config.NUM_UPDATES != -1 and config.TOTAL_NUM_STEPS != -1:
            raise RuntimeError(
                "NUM_UPDATES and TOTAL_NUM_STEPS are both specified. One must be -1.\n"
                " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format(
                    config.NUM_UPDATES, config.TOTAL_NUM_STEPS
                )
            )

        if config.NUM_UPDATES == -1 and config.TOTAL_NUM_STEPS == -1:
            raise RuntimeError(
                "One of NUM_UPDATES and TOTAL_NUM_STEPS must be specified.\n"
                " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format(
                    config.NUM_UPDATES, config.TOTAL_NUM_STEPS
                )
            )

        if config.NUM_CHECKPOINTS != -1 and config.CHECKPOINT_INTERVAL != -1:
            raise RuntimeError(
                "NUM_CHECKPOINTS and CHECKPOINT_INTERVAL are both specified."
                " One must be -1.\n"
                " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format(
                    config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL
                )
            )

        if config.NUM_CHECKPOINTS == -1 and config.CHECKPOINT_INTERVAL == -1:
            raise RuntimeError(
                "One of NUM_CHECKPOINTS and CHECKPOINT_INTERVAL must be specified"
                " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format(
                    config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL
                )
            )

    def percent_done(self) -> float:
        if self.config.NUM_UPDATES != -1:
            return self.num_updates_done / self.config.NUM_UPDATES
        else:
            return self.num_steps_done / self.config.TOTAL_NUM_STEPS

    def is_done(self) -> bool:
        return self.percent_done() >= 1.0

    def should_checkpoint(self) -> bool:
        needs_checkpoint = False
        if self.config.NUM_CHECKPOINTS != -1:
            checkpoint_every = 1 / self.config.NUM_CHECKPOINTS
            if (
                self._last_checkpoint_percent + checkpoint_every
                < self.percent_done()
            ):
                needs_checkpoint = True
                self._last_checkpoint_percent = self.percent_done()
        else:
            needs_checkpoint = (
                self.num_updates_done % self.config.CHECKPOINT_INTERVAL
            ) == 0
        return needs_checkpoint

    def _should_save_resume_state(self) -> bool:
        return SAVE_STATE.is_set() or (
            (
                not self.config.RL.preemption.save_state_batch_only
                or is_slurm_batch_job()
            )
            and (
                (
                    int(self.num_updates_done + 1)
                    % self.config.RL.preemption.save_resume_state_interval
                )
                == 0
            )
        )

    @property
...
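The should_checkpoint() logic above spaces NUM_CHECKPOINTS checkpoints evenly over training progress by remembering the percent complete at the last checkpoint. Here is a standalone toy run of that arithmetic; the values are made up purely for illustration.

NUM_CHECKPOINTS = 4
TOTAL_UPDATES = 20

last_checkpoint_percent = -1.0
checkpoint_every = 1 / NUM_CHECKPOINTS  # 0.25

for updates_done in range(1, TOTAL_UPDATES + 1):
    percent_done = updates_done / TOTAL_UPDATES
    if last_checkpoint_percent + checkpoint_every < percent_done:
        last_checkpoint_percent = percent_done
        print(f"checkpoint at update {updates_done} ({percent_done:.0%} done)")

# Fires at updates 1, 7, 13, 19: once up front, then just past
# each successive 25% progress boundary.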


resuming.py

Source: resuming.py (GitHub)


...
    old_sessions_ids = [session.session_id for session in old_sessions_query.all()]
    if old_sessions_ids:
        conn.query(ResumeState).filter(ResumeState.session_id.in_(old_sessions_ids)).delete(synchronize_session=False)
    old_sessions_query.delete(synchronize_session=False)


def save_resume_state(session_result, collected_tests):
    session_metadata = SessionMetadata(
        session_id=session_result.session.id,
        src_folder=os.path.abspath(os.getcwd()),
        created_at=datetime.now())
    tests_to_resume = []
    for result in session_result.iter_test_results():
        metadata = result.test_metadata
        test_to_resume = ResumeState(session_id=session_result.session.id, file_name=metadata.file_path, address_in_file=metadata.address_in_file)
        test_to_resume.variation = str(metadata.variation.id) if metadata.variation else None
        if result.is_success_finished():
            test_to_resume.status = ResumeTestStatus.SUCCESS
        elif not result.is_started() or result.is_skip():
            test_to_resume.status = ResumeTestStatus.PLANNED
        else:
...
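resuming.py is Slash's own implementation: at the end of a run it records, per test, whether it succeeded, was skipped or never started (planned), or failed, so that a later `slash resume <session-id>` can re-run only the unfinished tests. Below is a hypothetical sketch of invoking it from a session-end hook. The hook-registration style is Slash's documented gossip-based slash.hooks mechanism, but the exact arguments passed to save_resume_state here are an assumption, not documented API.

import slash
from slash.resuming import save_resume_state

@slash.hooks.session_end.register  # documented Slash hook-registration style
def _persist_resume_state():
    session = slash.context.session
    if session is None:
        return
    # ASSUMPTION: session.results exposes what save_resume_state expects
    # (a .session attribute and iter_test_results()); collected_tests is
    # unused in the excerpt above, so an empty list stands in here.
    save_resume_state(session.results, collected_tests=[])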


ddp_utils.py

Source: ddp_utils.py (GitHub)


...
    # SLURM always sends SIGTERM so we can use this to save and exit
    signal.signal(signal.SIGTERM, _clean_exit_and_save_handler)
    signal.signal(signal.SIGUSR1, _requeue_handler)


@rank0_only
def save_resume_state(state: Any, filename_or_config: Union[Config, str]):
    r"""Saves the resume job state to the specified filename.
    This is useful when working with preemptable job partitions.

    :param state: The state to save
    :param filename_or_config: The filename of the saved state or the config to construct it.
    """
    if isinstance(filename_or_config, Config):
        filename = resume_state_filename(filename_or_config)
    else:
        filename = filename_or_config

    torch.save(state, filename)


def load_resume_state(filename_or_config: Union[Config, str]) -> Optional[Any]:
    r"""Loads the saved resume state

    :param filename_or_config: The filename of the saved state or the config to construct it.

    :return: The saved state if the file exists, else none
...
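Since save_resume_state accepts either a Config or a plain filename, a quick round-trip looks like the following. This is a minimal sketch assuming habitat-lab is installed and that these helpers live at their usual habitat_baselines.rl.ddppo.ddp_utils import path, which may differ across versions.

from habitat_baselines.rl.ddppo.ddp_utils import (
    load_resume_state,
    save_resume_state,
)

# Filename branch: the isinstance(…, Config) check fails for a str,
# so the path is used as-is.
save_resume_state({"num_updates_done": 42}, "/tmp/resume_state.pth")

state = load_resume_state("/tmp/resume_state.pth")
if state is not None:  # load_resume_state returns None when no file exists yet
    print(state["num_updates_done"])  # -> 42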


