Best Python code snippet using behave
hadoop.py
Source:hadoop.py  
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import posixpath
import re
from subprocess import Popen
from subprocess import PIPE
from subprocess import CalledProcessError

try:
    from cStringIO import StringIO
    StringIO  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    from StringIO import StringIO

from mrjob.fs.base import Filesystem
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
from mrjob.util import read_file

log = logging.getLogger('mrjob.fs.hadoop')

# used by mkdir()
HADOOP_FILE_EXISTS_RE = re.compile(r'.*File exists.*')

# used by ls()
HADOOP_LSR_NO_SUCH_FILE = re.compile(
    r'^lsr: Cannot access .*: No such file or directory.')

# used by rm() (see below)
HADOOP_RMR_NO_SUCH_FILE = re.compile(r'^rmr: hdfs://.*$')


class HadoopFilesystem(Filesystem):
    """Filesystem for URIs accepted by ``hadoop fs``. Typically you will get
    one of these via ``HadoopJobRunner().fs``, composed with
    :py:class:`~mrjob.fs.local.LocalFilesystem`.
    """

    def __init__(self, hadoop_bin):
        """:param hadoop_bin: path to ``hadoop`` binary"""
        super(HadoopFilesystem, self).__init__()
        self._hadoop_bin = hadoop_bin

    def can_handle_path(self, path):
        return is_uri(path)

    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.
        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0, 1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args
        log.debug('> %s' % cmd_line(args))
        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()
        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in StringIO(stdout):
                log_func('STDOUT: ' + line.rstrip('\r\n'))
        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break
        if not stderr_is_ok:
            for line in StringIO(stderr):
                log_func('STDERR: ' + line.rstrip('\r\n'))
        ok_returncodes = ok_returncodes or [0]
        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)
        if return_stdout:
            return stdout
        else:
            return proc.returncode

    def du(self, path_glob):
        """Get the size of a file, or None if it's not a file or doesn't
        exist."""
        try:
            stdout = self.invoke_hadoop(['fs', '-dus', path_glob],
                                        return_stdout=True)
        except CalledProcessError:
            raise IOError(path_glob)
        try:
            return sum(int(line.split()[1])
                       for line in stdout.split('\n')
                       if line.strip())
        except (ValueError, TypeError, IndexError):
            raise IOError(
                'Unexpected output from hadoop fs -du: %r' % stdout)

    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)
        try:
            stdout = self.invoke_hadoop(
                ['fs', '-lsr', path_glob],
                return_stdout=True,
                ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)
        for line in StringIO(stdout):
            line = line.rstrip('\r\n')
            fields = line.split(' ')
            # Throw out directories
            if fields[0].startswith('d'):
                continue
            # Try to figure out which part of the line is the path
            # Expected lines:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar # HDFS
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar # S3
            path_index = None
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)
            path = line.split(' ', path_index)[-1]
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path

    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))
        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line
            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)
            returncode = cat_proc.wait()
            if returncode != 0:
                raise IOError("Could not stream %s" % filename)
        return read_file(filename, stream())

    def mkdir(self, path):
        try:
            self.invoke_hadoop(
                ['fs', '-mkdir', path], ok_stderr=[HADOOP_FILE_EXISTS_RE])
        except CalledProcessError:
            raise IOError("Could not mkdir %s" % path)

    def path_exists(self, path_glob):
        """Does the given path exist?
        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        try:
            return_code = self.invoke_hadoop(['fs', '-ls', path_glob],
                                             ok_returncodes=[0, -1, 255])
            return (return_code == 0)
        except CalledProcessError:
            raise IOError("Could not check path %s" % path_glob)

    def path_join(self, dirname, filename):
        return posixpath.join(dirname, filename)

    def rm(self, path_glob):
        if not is_uri(path_glob):
            super(HadoopFilesystem, self).rm(path_glob)
        if self.path_exists(path_glob):
            # hadoop fs -rmr will print something like:
            # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
            # to STDOUT, which we don't care about.
            #
            # if we ask to delete a path that doesn't exist, it prints
            # to STDERR something like:
            # rmr: <path>
            # which we can safely ignore
            try:
                self.invoke_hadoop(
                    ['fs', '-rmr', path_glob],
                    return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
            except CalledProcessError:
                raise IOError("Could not rm %s" % path_glob)

    def touchz(self, dest):
        try:
            self.invoke_hadoop(['fs', '-touchz', dest])
        except CalledProcessError:
...
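For orientation, here is a minimal usage sketch (not part of hadoop.py): it assumes a working hadoop binary is on the PATH and the HDFS paths are placeholders. Note that hadoop_bin is passed as an argument list, because invoke_hadoop() prepends it to the fs subcommand.

    # hypothetical usage sketch for HadoopFilesystem (paths are placeholders)
    from mrjob.fs.hadoop import HadoopFilesystem

    fs = HadoopFilesystem(['hadoop'])               # hadoop_bin is a list of args
    fs.mkdir('hdfs:///user/dave/output')            # "File exists" on stderr is tolerated
    for path in fs.ls('hdfs:///user/dave/logs/*'):  # yields files only, never directories
        print(path, fs.du(path))                    # shells out to `hadoop fs -dus`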
detector.py
Source:detector.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Detect Thermos tasks on disk
This module contains the TaskDetector, used to detect Thermos tasks within a given checkpoint root.
"""
import functools
import glob
import os
import re
from abc import abstractmethod

from twitter.common.lang import Compatibility, Interface

from apache.thermos.common.constants import DEFAULT_CHECKPOINT_ROOT
from apache.thermos.common.path import TaskPath


class PathDetector(Interface):
  @abstractmethod
  def get_paths(self):
    """Get a list of valid checkpoint roots."""


class FixedPathDetector(PathDetector):
  def __init__(self, path=DEFAULT_CHECKPOINT_ROOT):
    if not isinstance(path, Compatibility.string):
      raise TypeError('FixedPathDetector path should be a string, got %s' % type(path))
    self._paths = [path]

  def get_paths(self):
    return self._paths[:]


class ChainedPathDetector(PathDetector):
  def __init__(self, *detectors):
    for detector in detectors:
      if not isinstance(detector, PathDetector):
        raise TypeError('Expected detector %r to be a PathDetector, got %s' % (
            detector, type(detector)))
    self._detectors = detectors

  def get_paths(self):
    def iterate():
      for detector in self._detectors:
        for path in detector.get_paths():
          yield path
    return list(set(iterate()))


def memoized(fn):
  cache_attr_name = '__memoized_' + fn.__name__

  @functools.wraps(fn)
  def memoized_fn(self, *args):
    if not hasattr(self, cache_attr_name):
      setattr(self, cache_attr_name, {})
    cache = getattr(self, cache_attr_name)
    try:
      return cache[args]
    except KeyError:
      cache[args] = rv = fn(self, *args)
      return rv
  return memoized_fn


class TaskDetector(object):
  """
    Helper class in front of TaskPath to detect active/finished/running tasks. Performs no
    introspection on the state of a task; merely detects based on file paths on disk.
  """
  class Error(Exception): pass
  class MatchingError(Error): pass

  def __init__(self, root):
    self._root_dir = root
    self._pathspec = TaskPath()

  @memoized
  def __get_task_ids_patterns(self, state):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id="*",
        state=state or '*'
    ).getpath('task_path')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id="(\S+)",
        state='(\S+)'
    ).getpath('task_path')
    return path_glob, re.compile(path_regex)

  def get_task_ids(self, state=None):
    path_glob, path_regex = self.__get_task_ids_patterns(state)
    for path in glob.glob(path_glob):
      try:
        task_state, task_id = path_regex.match(path).groups()
      except Exception:
        continue
      if state is None or task_state == state:
        yield (task_state, task_id)

  @memoized
  def __get_process_runs_patterns(self, task_id, log_dir):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id=task_id,
        log_dir=log_dir,
        process='*',
        run='*'
    ).getpath('process_logdir')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id=re.escape(task_id),
        log_dir=log_dir,
        process='(\S+)',
        run='(\d+)'
    ).getpath('process_logdir')
    return path_glob, re.compile(path_regex)

  def get_process_runs(self, task_id, log_dir):
    path_glob, path_regex = self.__get_process_runs_patterns(task_id, log_dir)
    for path in glob.glob(path_glob):
      try:
        process, run = path_regex.match(path).groups()
      except Exception:
        continue
      yield process, int(run)

  def get_process_logs(self, task_id, log_dir):
    for process, run in self.get_process_runs(task_id, log_dir):
      for logtype in ('stdout', 'stderr'):
        path = (self._pathspec.with_filename(logtype).given(root=self._root_dir,
                                                            task_id=task_id,
                                                            log_dir=log_dir,
                                                            process=process,
                                                            run=run)
                                                     .getpath('process_logdir'))
        if os.path.exists(path):
          yield path

  def get_checkpoint(self, task_id):
    return self._pathspec.given(root=self._root_dir, task_id=task_id).getpath('runner_checkpoint')

  @memoized
  def __get_process_checkpoints_patterns(self, task_id):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id=task_id,
        process='*'
    ).getpath('process_checkpoint')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id=re.escape(task_id),
        process='(\S+)',
    ).getpath('process_checkpoint')
    return path_glob, re.compile(path_regex)

  def get_process_checkpoints(self, task_id):
    path_glob, path_regex = self.__get_process_checkpoints_patterns(task_id)
    for path in glob.glob(path_glob):
      try:
        process, = path_regex.match(path).groups()
      except Exception:
        continue
...
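A hypothetical sketch of how the detector might be driven (not from the Thermos sources): it assumes a checkpoint root laid out by TaskPath; the state string 'active' and the log_dir value are placeholders for whatever the surrounding runner uses.

    # hypothetical sketch: enumerate Thermos tasks under the default checkpoint root
    from apache.thermos.common.constants import DEFAULT_CHECKPOINT_ROOT

    detector = TaskDetector(root=DEFAULT_CHECKPOINT_ROOT)
    for task_state, task_id in detector.get_task_ids(state='active'):
        print(task_state, task_id, detector.get_checkpoint(task_id))
        for log_path in detector.get_process_logs(task_id, log_dir='log'):
            print('  ', log_path)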
scans_to_tfrecords.py
Source:scans_to_tfrecords.py
import argparse
from pathlib import Path

import numpy as np
from scipy.ndimage import zoom
import tensorflow as tf
from tqdm import tqdm
import nrrd
from pydicom import dcmread
import nibabel as nib


def preprocess_scan(scan, downsample):
    "Apply some preprocessing to the image"
    if downsample != 1:
        z, y, x = scan.shape
        scan = zoom(scan, (48 / z, 256 / y, 256 / x), order=5)
        # scan = zoom(scan, 1 / downsample)
    scan = scan.astype(np.float32)
    return scan


def scan_to_example(scan):
    "Convert a scan (a NumPy array) to an Example class"
    z, y, x = scan.shape
    scan_raw = scan.tostring()
    scan_features = {
        "z": tf.train.Feature(int64_list=tf.train.Int64List(value=[z])),
        "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[y])),
        "x": tf.train.Feature(int64_list=tf.train.Int64List(value=[x])),
        "scan_raw": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[scan_raw])
        ),
    }
    return tf.train.Example(features=tf.train.Features(feature=scan_features))


def split_into_subsequences(data, s):
    """
    Split the input sequence into sublists of size s.
    >>> s = "abcdefg"
    >>> split_into_subsequences(s, 2)
    ['ab', 'cd', 'ef', 'g']
    """
    return [data[x : x + s] for x in range(0, len(data), s)]


def save_scan(writer, scan):
    """Serialize the scan in a tfrecord file.
    If the scan is not a regular volume (the first axis
    has length 0) then return without doing anything.
    """
    if scan.shape[0] < 1:
        return
    example = scan_to_example(scan)
    writer.write(example.SerializeToString())


def convert_nrrd(path_glob, output_dir_name, downsample):
    nrrd_files = [str(f) for f in Path(".").glob(path_glob)]
    nrrd_files = split_into_subsequences(nrrd_files, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(nrrd_files, start=1), total=len(nrrd_files)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for fname in chunk:
                scan, _ = nrrd.read(fname, index_order="C")
                scan = preprocess_scan(scan, downsample)
                save_scan(writer, scan)


def convert_dicom(path_glob, output_dir_name, downsample):
    dcm_directories = list(Path(".").glob(path_glob))
    dcm_directories = split_into_subsequences(dcm_directories, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(dcm_directories, start=1), total=len(dcm_directories)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for dcm_dir in chunk:
                dcm_files = Path(dcm_dir).glob("*.dcm")
                dcm_slices = [dcmread(str(f)) for f in dcm_files]
                is_volume = (
                    hasattr(dcm_slices[0], "SliceLocation")
                    and len(dcm_slices) >= 4
                )
                if is_volume:
                    dcm_slices = sorted(
                        dcm_slices, key=lambda x: x.SliceLocation
                    )
                    if all(
                        s.pixel_array.shape == (512, 512) for s in dcm_slices
                    ):
                        scan = np.stack([s.pixel_array for s in dcm_slices])
                        scan = preprocess_scan(scan, downsample)
                        save_scan(writer, scan)


def convert_nifti(path_glob, output_dir_name, downsample):
    nifti_files = [str(f) for f in Path(".").glob(path_glob)]
    nifti_files = split_into_subsequences(nifti_files, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(nifti_files, start=1), total=len(nifti_files)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for fname in chunk:
                nib_obj = nib.load(fname)
                scan = nib_obj.get_fdata().T  # transpose to obtain C order
                scan = preprocess_scan(scan, downsample)
                save_scan(writer, scan)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
    parser = argparse.ArgumentParser(
        description="Convert multiple CT scans to a single tfrecord file"
    )
    parser.add_argument("file_type", choices=["nrrd", "dicom", "nifti"])
    parser.add_argument(
        "path_glob",
        help="Glob that identifies all the nrrd/dicom/nifti files to convert (must be inside quotes)",
    )
    parser.add_argument(
        "-d", "--downsample", type=float, default=1, help="Downscaling factor"
    )
    parser.add_argument(
        "output_dir",
        help="Name of the directory where to store the tfrecords files",
    )
    args = parser.parse_args()
    if args.file_type == "nrrd":
        convert_nrrd(args.path_glob, args.output_dir, args.downsample)
    if args.file_type == "dicom":
        convert_dicom(args.path_glob, args.output_dir, args.downsample)
    if args.file_type == "nifti":
...
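To show what the writer side produces, here is a hypothetical reader sketch (not part of the script): it assumes TensorFlow 2.x and the z/y/x/scan_raw schema defined in scan_to_example; the record file name and the example invocation paths are placeholders. A typical invocation might look like: python scans_to_tfrecords.py nifti "data/*.nii.gz" out_dir -d 2

    # hypothetical reader for the records written above (TF 2.x assumed)
    import tensorflow as tf

    feature_spec = {
        "z": tf.io.FixedLenFeature([], tf.int64),
        "y": tf.io.FixedLenFeature([], tf.int64),
        "x": tf.io.FixedLenFeature([], tf.int64),
        "scan_raw": tf.io.FixedLenFeature([], tf.string),
    }

    def parse_scan(serialized):
        parsed = tf.io.parse_single_example(serialized, feature_spec)
        scan = tf.io.decode_raw(parsed["scan_raw"], tf.float32)  # written with astype(np.float32)
        return tf.reshape(scan, [parsed["z"], parsed["y"], parsed["x"]])

    dataset = tf.data.TFRecordDataset(["out_dir/01.tfrecord"]).map(parse_scan)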
base.py
Source:base.py
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

log = logging.getLogger('mrjob.fs')


class Filesystem(object):
    """Some simple filesystem operations that are common across the local
    filesystem, S3, HDFS, and remote machines via SSH. Different runners
    provide functionality for different filesystems via their
    :py:attr:`~mrjob.runner.MRJobRunner.fs` attribute. The ``hadoop`` and
    ``emr`` runners provide support for multiple protocols using
    :py:class:`~mrjob.job.composite.CompositeFilesystem`.
    Protocol support:
    * :py:class:`mrjob.fs.hadoop.HadoopFilesystem`: ``hdfs://``, others
    * :py:class:`mrjob.fs.local.LocalFilesystem`: ``/``
    * :py:class:`mrjob.fs.s3.S3Filesystem`: ``s3://bucket/path``,
      ``s3n://bucket/path``
    * :py:class:`mrjob.fs.ssh.SSHFilesystem`: ``ssh://hostname/path``
    """

    def cat(self, path_glob):
        """cat all files matching **path_glob**, decompressing if necessary"""
        for filename in self.ls(path_glob):
            for line in self._cat_file(filename):
                yield line

    def du(self, path_glob):
        """Get the total size of files matching ``path_glob``
        Corresponds roughly to: ``hadoop fs -dus path_glob``
        """
        raise NotImplementedError

    def ls(self, path_glob):
        """Recursively list all files in the given path.
        We don't return directories for compatibility with S3 (which
        has no concept of them)
        Corresponds roughly to: ``hadoop fs -lsr path_glob``
        """
        raise NotImplementedError

    def _cat_file(self, path):
        raise NotImplementedError

    def mkdir(self, path):
        """Create the given dir and its subdirs (if they don't already
        exist).
        Corresponds roughly to: ``hadoop fs -mkdir path``
        """
        raise NotImplementedError

    def path_exists(self, path_glob):
        """Does the given path exist?
        Corresponds roughly to: ``hadoop fs -test -e path_glob``
        """
        raise NotImplementedError

    def path_join(self, dirname, filename):
        raise NotImplementedError

    def rm(self, path_glob):
        """Recursively delete the given file/directory, if it exists
        Corresponds roughly to: ``hadoop fs -rmr path_glob``
        """
        raise NotImplementedError

    def touchz(self, path):
        """Make an empty file in the given location. Raises an error if
        a non-zero length file already exists in that location.
        Corresponds to: ``hadoop fs -touchz path``
        """
        raise NotImplementedError

    def md5sum(self, path_glob):
        """Generate the md5 sum of the file at ``path``"""
...
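As a rough illustration of the contract (not part of mrjob): a subclass only has to fill in the primitives, and ``cat()`` then composes ``ls()`` and ``_cat_file()`` for free. The class name and paths below are hypothetical, and unlike the HDFS implementation this toy ls() does not recurse.

    # hypothetical sketch of a minimal Filesystem subclass for local paths
    import glob

    class ToyLocalFilesystem(Filesystem):
        def ls(self, path_glob):
            # non-recursive glob over local files
            for path in glob.glob(path_glob):
                yield path

        def _cat_file(self, path):
            with open(path, 'rb') as f:
                for line in f:
                    yield line

    fs = ToyLocalFilesystem()
    for line in fs.cat('/tmp/example-*.txt'):   # placeholder paths
        print(line)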
