How to use the safe_isinstance method in Slash

Best Python code snippet using slash

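The listing below is the TreeExplainer source from shap's _tree.py, which calls safe_isinstance throughout to detect optional model and data types (pandas, xgboost, lightgbm, catboost, pyspark, ...) without importing them up front. As a quick orientation, here is a minimal sketch of what safe_isinstance does; this is a simplified re-implementation for illustration, not the library's exact code:

import sys

def safe_isinstance(obj, class_path_str):
    """Check isinstance against fully qualified class name(s) given as strings,
    without importing the module. If the module was never imported, the object
    cannot be an instance of its classes, so we simply return False."""
    if isinstance(class_path_str, str):
        class_path_str = [class_path_str]
    for path in class_path_str:
        module_name, _, class_name = path.rpartition(".")
        module = sys.modules.get(module_name)  # only present if already imported
        # getattr default () makes isinstance return False for unknown classes
        if module is not None and isinstance(obj, getattr(module, class_name, ())):
            return True
    return False

For example, safe_isinstance(df, "pandas.core.frame.DataFrame") is True for a DataFrame but False for a list, and it never raises an ImportError for libraries that are not installed.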
_tree.py

Source: _tree.py (GitHub)


...77 See `Tree explainer examples <https://shap.readthedocs.io/en/latest/api_examples/explainers/Tree.html>`_78 """79 if feature_names is not None:80 self.data_feature_names=feature_names81 elif safe_isinstance(data, "pandas.core.frame.DataFrame"):82 self.data_feature_names = list(data.columns)83 masker = data84 super(Tree, self).__init__(model, masker, feature_names=feature_names)85 if type(self.masker) is maskers.Independent:86 data = self.masker.data87 elif masker is not None:88 raise Exception("Unsupported masker type: %s!" % str(type(self.masker)))89 if getattr(self.masker, "clustering", None) is not None:90 raise Exception("TreeExplainer does not support clustered data inputs! Please use shap.Explainer or pass an unclustered masker!")91 # check for deprecated options92 if model_output == "margin":93 warnings.warn("model_output = \"margin\" has been renamed to model_output = \"raw\"")94 model_output = "raw"95 if model_output == "logloss":96 warnings.warn("model_output = \"logloss\" has been renamed to model_output = \"log_loss\"")97 model_output = "log_loss"98 if "feature_dependence" in deprecated_options:99 dep_val = deprecated_options["feature_dependence"]100 if dep_val == "independent" and feature_perturbation == "interventional":101 warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \102 " = \"interventional\"! See GitHub issue #882.")103 elif feature_perturbation != "interventional":104 warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \105 " = \"interventional\", you can't supply both options! See GitHub issue #882.")106 if dep_val == "tree_path_dependent" and feature_perturbation == "interventional":107 raise Exception("The feature_dependence option has been renamed to feature_perturbation! " \108 "Please update the option name before calling TreeExplainer. See GitHub issue #882.")109 if feature_perturbation == "independent":110 raise Exception("feature_perturbation = \"independent\" is not a valid option value, please use " \111 "feature_perturbation = \"interventional\" instead. See GitHub issue #882.")112 if safe_isinstance(data, "pandas.core.frame.DataFrame"):113 self.data = data.values114 elif isinstance(data, DenseData):115 self.data = data.data116 else:117 self.data = data118 if self.data is None:119 feature_perturbation = "tree_path_dependent"120 #warnings.warn("Setting feature_perturbation = \"tree_path_dependent\" because no background data was given.")121 elif feature_perturbation == "interventional" and self.data.shape[0] > 1000:122 warnings.warn("Passing "+str(self.data.shape[0]) + " background samples may lead to slow runtimes. 
Consider "123 "using shap.sample(data, 100) to create a smaller background data set.")124 self.data_missing = None if self.data is None else pd.isna(self.data)125 self.feature_perturbation = feature_perturbation126 self.expected_value = None127 self.model = TreeEnsemble(model, self.data, self.data_missing, model_output)128 self.model_output = model_output129 #self.model_output = self.model.model_output # this allows the TreeEnsemble to translate model outputs types by how it loads the model130 if feature_perturbation not in feature_perturbation_codes:131 raise ValueError("Invalid feature_perturbation option!")132 # check for unsupported combinations of feature_perturbation and model_outputs133 if feature_perturbation == "tree_path_dependent":134 if self.model.model_output != "raw":135 raise ValueError("Only model_output=\"raw\" is supported for feature_perturbation=\"tree_path_dependent\"")136 elif data is None:137 raise ValueError("A background dataset must be provided unless you are using feature_perturbation=\"tree_path_dependent\"!")138 if self.model.model_output != "raw":139 if self.model.objective is None and self.model.tree_output is None:140 raise Exception("Model does not have a known objective or output type! When model_output is " \141 "not \"raw\" then we need to know the model's objective or link function.")142 # A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs143 if safe_isinstance(model, "xgboost.sklearn.XGBClassifier") and self.model.model_output != "raw":144 import xgboost145 if LooseVersion(xgboost.__version__) < LooseVersion('0.81'):146 raise RuntimeError("A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs! Please upgrade to XGBoost >= v0.81!")147 # compute the expected value if we have a parsed tree for the cext148 if self.model.model_output == "log_loss":149 self.expected_value = self.__dynamic_expected_value150 elif data is not None:151 try:152 self.expected_value = self.model.predict(self.data).mean(0)153 except ValueError:154 raise Exception("Currently TreeExplainer can only handle models with categorical splits when " \155 "feature_perturbation=\"tree_path_dependent\" and no background data is passed. 
Please try again using " \156 "shap.TreeExplainer(model, feature_perturbation=\"tree_path_dependent\").")157 if hasattr(self.expected_value, '__len__') and len(self.expected_value) == 1:158 self.expected_value = self.expected_value[0]159 elif hasattr(self.model, "node_sample_weight"):160 self.expected_value = self.model.values[:,0].sum(0)161 if self.expected_value.size == 1:162 self.expected_value = self.expected_value[0]163 self.expected_value += self.model.base_offset164 if self.model.model_output != "raw":165 self.expected_value = None # we don't handle transforms in this case right now...166 # if our output format requires binary classification to be represented as two outputs then we do that here167 if self.model.model_output == "probability_doubled" and self.expected_value is not None:168 self.expected_value = [1-self.expected_value, self.expected_value]169 def __dynamic_expected_value(self, y):170 """ This computes the expected value conditioned on the given label value.171 """172 return self.model.predict(self.data, np.ones(self.data.shape[0]) * y).mean(0)173 def __call__(self, X, y=None, interactions=False, check_additivity=True):174 if safe_isinstance(X, "pandas.core.frame.DataFrame"):175 feature_names = list(X.columns)176 X = X.values177 else:178 feature_names = getattr(self, "data_feature_names", None)179 if not interactions:180 v = self.shap_values(X, y=y, from_call=True, check_additivity=check_additivity)181 output_shape = tuple()182 if type(v) is list:183 output_shape = (len(v),)184 v = np.stack(v, axis=-1) # put outputs at the end185 # the explanation object expects an expected value for each row186 if hasattr(self.expected_value, "__len__"):187 ev_tiled = np.tile(self.expected_value, (v.shape[0],1))188 else:189 ev_tiled = np.tile(self.expected_value, v.shape[0])190 e = Explanation(v, base_values=ev_tiled, data=X, feature_names=feature_names)191 else:192 v = self.shap_interaction_values(X)193 e = Explanation(v, base_values=self.expected_value, data=X, feature_names=feature_names, interaction_order=2)194 return e195 def _validate_inputs(self, X, y, tree_limit, check_additivity):196 # see if we have a default tree_limit in place.197 if tree_limit is None:198 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit199 if tree_limit < 0 or tree_limit > self.model.values.shape[0]:200 tree_limit = self.model.values.shape[0]201 # convert dataframes202 if safe_isinstance(X, "pandas.core.series.Series"):203 X = X.values204 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):205 X = X.values206 flat_output = False207 if len(X.shape) == 1:208 flat_output = True209 X = X.reshape(1, X.shape[0])210 if X.dtype != self.model.input_dtype:211 X = X.astype(self.model.input_dtype)212 X_missing = np.isnan(X, dtype=np.bool)213 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))214 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"215 if self.model.model_output == "log_loss":216 assert y is not None, "Both samples and labels must be provided when model_output = " \217 "\"log_loss\" (i.e. `explainer.shap_values(X, y)`)!"218 assert X.shape[0] == len(219 y), "The number of labels (%d) does not match the number of samples to explain (" \220 "%d)!" 
% (221 len(y), X.shape[0])222 if self.feature_perturbation == "tree_path_dependent":223 assert self.model.fully_defined_weighting, "The background dataset you provided does " \224 "not cover all the leaves in the model, " \225 "so TreeExplainer cannot run with the " \226 "feature_perturbation=\"tree_path_dependent\" option! " \227 "Try providing a larger background " \228 "dataset, or using " \229 "feature_perturbation=\"interventional\"."230 if check_additivity and self.model.model_type == "pyspark":231 warnings.warn(232 "check_additivity requires us to run predictions which is not supported with "233 "spark, "234 "ignoring."235 " Set check_additivity=False to remove this warning")236 check_additivity = False237 return X, y, X_missing, flat_output, tree_limit, check_additivity238 def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True, from_call=False):239 """ Estimate the SHAP values for a set of samples.240 Parameters241 ----------242 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)243 A matrix of samples (# samples x # features) on which to explain the model's output.244 y : numpy.array245 An array of label values for each sample. Used when explaining loss functions.246 tree_limit : None (default) or int247 Limit the number of trees used by the model. By default None means no use the limit of the248 original model, and -1 means no limit.249 approximate : bool250 Run fast, but only roughly approximate the Tree SHAP values. This runs a method251 previously proposed by Saabas which only considers a single feature ordering. Take care252 since this does not have the consistency guarantees of Shapley values and places too253 much weight on lower splits in the tree.254 check_additivity : bool255 Run a validation check that the sum of the SHAP values equals the output of the model. This256 check takes only a small amount of time, and will catch potential unforeseen errors.257 Note that this check only runs right now when explaining the margin of the model.258 Returns259 -------260 array or list261 For models with a single output this returns a matrix of SHAP values262 (# samples x # features). Each row sums to the difference between the model output for that263 sample and the expected value of the model output (which is stored in the expected_value264 attribute of the explainer when it is constant). For models with vector outputs this returns265 a list of such matrices, one for each output.266 """267 # see if we have a default tree_limit in place.268 if tree_limit is None:269 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit270 # shortcut using the C++ version of Tree SHAP in XGBoost, LightGBM, and CatBoost271 if self.feature_perturbation == "tree_path_dependent" and self.model.model_type != "internal" and self.data is None:272 model_output_vals = None273 phi = None274 if self.model.model_type == "xgboost":275 import xgboost276 if not isinstance(X, xgboost.core.DMatrix):277 X = xgboost.DMatrix(X)278 if tree_limit == -1:279 tree_limit = 0280 try:281 phi = self.model.original_model.predict(282 X, ntree_limit=tree_limit, pred_contribs=True,283 approx_contribs=approximate, validate_features=False284 )285 except ValueError as e:286 raise ValueError("This reshape error is often caused by passing a bad data matrix to SHAP. 
" \287 "See https://github.com/slundberg/shap/issues/580") from e288 if check_additivity and self.model.model_output == "raw":289 model_output_vals = self.model.original_model.predict(290 X, ntree_limit=tree_limit, output_margin=True,291 validate_features=False292 )293 elif self.model.model_type == "lightgbm":294 assert not approximate, "approximate=True is not supported for LightGBM models!"295 phi = self.model.original_model.predict(X, num_iteration=tree_limit, pred_contrib=True)296 # Note: the data must be joined on the last axis297 if self.model.original_model.params['objective'] == 'binary':298 if not from_call:299 warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')300 phi = np.concatenate((0-phi, phi), axis=-1)301 if phi.shape[1] != X.shape[1] + 1:302 try:303 phi = phi.reshape(X.shape[0], phi.shape[1]//(X.shape[1]+1), X.shape[1]+1)304 except ValueError as e:305 raise Exception("This reshape error is often caused by passing a bad data matrix to SHAP. " \306 "See https://github.com/slundberg/shap/issues/580") from e307 elif self.model.model_type == "catboost": # thanks to the CatBoost team for implementing this...308 assert not approximate, "approximate=True is not supported for CatBoost models!"309 assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!"310 import catboost311 if type(X) != catboost.Pool:312 X = catboost.Pool(X, cat_features=self.model.cat_feature_indices)313 phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues')314 # note we pull off the last column and keep it as our expected_value315 if phi is not None:316 if len(phi.shape) == 3:317 self.expected_value = [phi[0, i, -1] for i in range(phi.shape[1])]318 out = [phi[:, i, :-1] for i in range(phi.shape[1])]319 else:320 self.expected_value = phi[0, -1]321 out = phi[:, :-1]322 if check_additivity and model_output_vals is not None:323 self.assert_additivity(out, model_output_vals)324 return out325 X, y, X_missing, flat_output, tree_limit, check_additivity = self._validate_inputs(X, y,326 tree_limit,327 check_additivity)328 transform = self.model.get_transform()329 # run the core algorithm using the C extension330 assert_import("cext")331 phi = np.zeros((X.shape[0], X.shape[1]+1, self.model.num_outputs))332 if not approximate:333 _cext.dense_tree_shap(334 self.model.children_left, self.model.children_right, self.model.children_default,335 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,336 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,337 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],338 output_transform_codes[transform], False339 )340 else:341 _cext.dense_tree_saabas(342 self.model.children_left, self.model.children_right, self.model.children_default,343 self.model.features, self.model.thresholds, self.model.values,344 self.model.max_depth, tree_limit, self.model.base_offset, output_transform_codes[transform],345 X, X_missing, y, phi346 )347 out = self._get_shap_output(phi, flat_output)348 if check_additivity and self.model.model_output == "raw":349 self.assert_additivity(out, self.model.predict(X))350 return out351 # we pull off the last column and keep it as our expected_value352 def _get_shap_output(self, phi, flat_output):353 if self.model.num_outputs == 1:354 if self.expected_value is None and self.model.model_output != "log_loss":355 self.expected_value = phi[0, -1, 0]356 if 
flat_output:357 out = phi[0, :-1, 0]358 else:359 out = phi[:, :-1, 0]360 else:361 if self.expected_value is None and self.model.model_output != "log_loss":362 self.expected_value = [phi[0, -1, i] for i in range(phi.shape[2])]363 if flat_output:364 out = [phi[0, :-1, i] for i in range(self.model.num_outputs)]365 else:366 out = [phi[:, :-1, i] for i in range(self.model.num_outputs)]367 # if our output format requires binary classificaiton to be represented as two outputs then we do that here368 if self.model.model_output == "probability_doubled":369 out = [-out, out]370 return out371 def shap_interaction_values(self, X, y=None, tree_limit=None):372 """ Estimate the SHAP interaction values for a set of samples.373 Parameters374 ----------375 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)376 A matrix of samples (# samples x # features) on which to explain the model's output.377 y : numpy.array378 An array of label values for each sample. Used when explaining loss functions (not yet supported).379 tree_limit : None (default) or int380 Limit the number of trees used by the model. By default None means no use the limit of the381 original model, and -1 means no limit.382 Returns383 -------384 array or list385 For models with a single output this returns a tensor of SHAP values386 (# samples x # features x # features). The matrix (# features x # features) for each sample sums387 to the difference between the model output for that sample and the expected value of the model output388 (which is stored in the expected_value attribute of the explainer). Each row of this matrix sums to the389 SHAP value for that feature for that sample. The diagonal entries of the matrix represent the390 "main effect" of that feature on the prediction and the symmetric off-diagonal entries represent the391 interaction effects between all pairs of features for that sample. 
For models with vector outputs392 this returns a list of tensors, one for each output.393 """394 assert self.model.model_output == "raw", "Only model_output = \"raw\" is supported for SHAP interaction values right now!"395 #assert self.feature_perturbation == "tree_path_dependent", "Only feature_perturbation = \"tree_path_dependent\" is supported for SHAP interaction values right now!"396 transform = "identity"397 # see if we have a default tree_limit in place.398 if tree_limit is None:399 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit400 # shortcut using the C++ version of Tree SHAP in XGBoost401 if self.model.model_type == "xgboost" and self.feature_perturbation == "tree_path_dependent":402 import xgboost403 if not isinstance(X, xgboost.core.DMatrix):404 X = xgboost.DMatrix(X)405 if tree_limit == -1:406 tree_limit = 0407 phi = self.model.original_model.predict(X, ntree_limit=tree_limit, pred_interactions=True, validate_features=False)408 # note we pull off the last column and keep it as our expected_value409 if len(phi.shape) == 4:410 self.expected_value = [phi[0, i, -1, -1] for i in range(phi.shape[1])]411 return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])]412 else:413 self.expected_value = phi[0, -1, -1]414 return phi[:, :-1, :-1]415 X, y, X_missing, flat_output, tree_limit, _ = self._validate_inputs(X, y, tree_limit, False)416 # run the core algorithm using the C extension417 assert_import("cext")418 phi = np.zeros((X.shape[0], X.shape[1]+1, X.shape[1]+1, self.model.num_outputs))419 _cext.dense_tree_shap(420 self.model.children_left, self.model.children_right, self.model.children_default,421 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,422 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,423 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],424 output_transform_codes[transform], True425 )426 return self._get_shap_interactions_output(phi,flat_output)427 # we pull off the last column and keep it as our expected_value428 def _get_shap_interactions_output(self, phi, flat_output):429 if self.model.num_outputs == 1:430 self.expected_value = phi[0, -1, -1, 0]431 if flat_output:432 out = phi[0, :-1, :-1, 0]433 else:434 out = phi[:, :-1, :-1, 0]435 else:436 self.expected_value = [phi[0, -1, -1, i] for i in range(phi.shape[3])]437 if flat_output:438 out = [phi[0, :-1, :-1, i] for i in range(self.model.num_outputs)]439 else:440 out = [phi[:, :-1, :-1, i] for i in range(self.model.num_outputs)]441 return out442 def assert_additivity(self, phi, model_output):443 def check_sum(sum_val, model_output):444 diff = np.abs(sum_val - model_output)445 if np.max(diff / (np.abs(sum_val) + 1e-2)) > 1e-2:446 ind = np.argmax(diff)447 err_msg = "Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the " \448 "explainer is the same shape that the model was trained on. If your data shape is correct " \449 "then please report this on GitHub."450 if self.feature_perturbation != "interventional":451 err_msg += " Consider retrying with the feature_perturbation='interventional' option."452 err_msg += " This check failed because for one of the samples the sum of the SHAP values" \453 " was %f, while the model output was %f. If this difference is acceptable" \454 " you can set check_additivity=False to disable this check." 
% (sum_val[ind], model_output[ind])455 raise Exception(err_msg)456 if type(phi) is list:457 for i in range(len(phi)):458 check_sum(self.expected_value[i] + phi[i].sum(-1), model_output[:,i])459 else:460 check_sum(self.expected_value + phi.sum(-1), model_output)461 @staticmethod462 def supports_model_with_masker(model, masker):463 """ Determines if this explainer can handle the given model.464 This is an abstract static method meant to be implemented by each subclass.465 """466 if not isinstance(masker, (maskers.Independent)) and masker is not None:467 return False468 try:469 TreeEnsemble(model)470 except:471 return False472 return True473class TreeEnsemble:474 """ An ensemble of decision trees.475 This object provides a common interface to many different types of models.476 """477 def __init__(self, model, data=None, data_missing=None, model_output=None):478 self.model_type = "internal"479 self.trees = None480 self.base_offset = 0481 self.model_output = model_output482 self.objective = None # what we explain when explaining the loss of the model483 self.tree_output = None # what are the units of the values in the leaves of the trees484 self.internal_dtype = np.float64485 self.input_dtype = np.float64 # for sklearn we need to use np.float32 to always get exact matches to their predictions486 self.data = data487 self.data_missing = data_missing488 self.fully_defined_weighting = True # does the background dataset land in every leaf (making it valid for the tree_path_dependent method)489 self.tree_limit = None # used for limiting the number of trees we use by default (like from early stopping)490 self.num_stacked_models = 1 # If this is greater than 1 it means we have multiple stacked models with the same number of trees in each model (XGBoost multi-output style)491 self.cat_feature_indices = None # If this is set it tells us which features are treated categorically492 # we use names like keras493 objective_name_map = {494 "mse": "squared_error",495 "variance": "squared_error",496 "friedman_mse": "squared_error",497 "reg:linear": "squared_error",498 "reg:squarederror": "squared_error",499 "regression": "squared_error",500 "regression_l2": "squared_error",501 "mae": "absolute_error",502 "gini": "binary_crossentropy",503 "entropy": "binary_crossentropy",504 "reg:logistic": "binary_crossentropy",505 "binary:logistic": "binary_crossentropy",506 "binary_logloss": "binary_crossentropy",507 "binary": "binary_crossentropy"508 }509 tree_output_name_map = {510 "regression": "raw_value",511 "regression_l2": "squared_error",512 "reg:linear": "raw_value",513 "reg:squarederror": "raw_value",514 "reg:logistic": "log_odds",515 "binary:logistic": "log_odds",516 "binary_logloss": "log_odds",517 "binary": "log_odds"518 }519 if type(model) is dict and "trees" in model:520 # This allows a dictionary to be passed that represents the model.521 # this dictionary has several numerica paramters and also a list of trees522 # where each tree is a dictionary describing that tree523 if "internal_dtype" in model:524 self.internal_dtype = model["internal_dtype"]525 if "input_dtype" in model:526 self.input_dtype = model["input_dtype"]527 if "objective" in model:528 self.objective = model["objective"]529 if "tree_output" in model:530 self.tree_output = model["tree_output"]531 if "base_offset" in model:532 self.base_offset = model["base_offset"]533 self.trees = [SingleTree(t, data=data, data_missing=data_missing) for t in model["trees"]]534 elif type(model) is list and type(model[0]) == SingleTree: # old-style direct-load 
format535 self.trees = model536 elif safe_isinstance(model, ["sklearn.ensemble.RandomForestRegressor", "sklearn.ensemble.forest.RandomForestRegressor", "econml.grf._base_grf.BaseGRF"]):537 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"538 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type539 self.input_dtype = np.float32540 scaling = 1.0 / len(model.estimators_) # output is average of trees541 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]542 self.objective = objective_name_map.get(model.criterion, None)543 self.tree_output = "raw_value"544 elif safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble._iforest.IsolationForest"]):545 self.dtype = np.float32546 scaling = 1.0 / len(model.estimators_) # output is average of trees547 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.estimators_, model.estimators_features_)]548 self.tree_output = "raw_value"549 elif safe_isinstance(model, ["pyod.models.iforest.IForest"]):550 self.dtype = np.float32551 scaling = 1.0 / len(model.estimators_) # output is average of trees552 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.detector_.estimators_, model.detector_.estimators_features_)]553 self.tree_output = "raw_value"554 elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):555 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"556 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type557 self.input_dtype = np.float32558 scaling = 1.0 / len(model.estimators_) # output is average of trees559 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]560 self.objective = objective_name_map.get(model.criterion, None)561 self.tree_output = "raw_value"562 elif safe_isinstance(model, "sklearn.ensemble.AdaBoostRegressor"):563 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"564 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type565 self.input_dtype = np.float32566 scaling = 1.0 / len(model.estimators_) # output is average of trees567 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]568 self.objective = objective_name_map.get(model.base_estimator_.criterion, None)569 self.tree_output = "raw_value"570 elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]):571 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"572 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type573 self.input_dtype = np.float32574 scaling = 1.0 / len(model.estimators_) # output is average of trees575 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]576 self.objective = objective_name_map.get(model.criterion, None)577 self.tree_output = "raw_value"578 elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):579 assert hasattr(model, "estimators_"), "Model has no `estimators_`! 
Have you called `model.fit`?"580 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type581 self.input_dtype = np.float32582 scaling = 1.0 / len(model.estimators_) # output is average of trees583 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]584 self.objective = objective_name_map.get(model.criterion, None)585 self.tree_output = "raw_value"586 elif safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor", "econml.grf._base_grftree.GRFTree"]):587 self.internal_dtype = model.tree_.value.dtype.type588 self.input_dtype = np.float32589 self.trees = [SingleTree(model.tree_, data=data, data_missing=data_missing)]590 self.objective = objective_name_map.get(model.criterion, None)591 self.tree_output = "raw_value"592 elif safe_isinstance(model, ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]):593 self.internal_dtype = model.tree_.value.dtype.type594 self.input_dtype = np.float32595 self.trees = [SingleTree(model.tree_, normalize=True, data=data, data_missing=data_missing)]596 self.objective = objective_name_map.get(model.criterion, None)597 self.tree_output = "probability"598 elif safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier", "sklearn.ensemble.forest.RandomForestClassifier"]):599 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"600 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type601 self.input_dtype = np.float32602 scaling = 1.0 / len(model.estimators_) # output is average of trees603 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]604 self.objective = objective_name_map.get(model.criterion, None)605 self.tree_output = "probability"606 elif safe_isinstance(model, ["sklearn.ensemble.AdaBoostClassifier", "sklearn.ensemble._weighted_boosting.AdaBoostClassifier"]):607 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"608 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type609 self.input_dtype = np.float32610 scaling = 1.0 / len(model.estimators_) # output is average of trees611 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling) for e in model.estimators_]612 self.objective = objective_name_map.get(model.base_estimator_.criterion, None) #This line is done to get the decision criteria, for example gini.613 self.tree_output = "probability" #This is the last line added614 elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier", "sklearn.ensemble.forest.ExtraTreesClassifier"]): # TODO: add unit test for this case615 assert hasattr(model, "estimators_"), "Model has no `estimators_`! 
Have you called `model.fit`?"616 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type617 self.input_dtype = np.float32618 scaling = 1.0 / len(model.estimators_) # output is average of trees619 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]620 self.objective = objective_name_map.get(model.criterion, None)621 self.tree_output = "probability"622 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor", "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]):623 self.input_dtype = np.float32624 # currently we only support the mean and quantile estimators625 if safe_isinstance(model.init_, ["sklearn.ensemble.MeanEstimator", "sklearn.ensemble.gradient_boosting.MeanEstimator"]):626 self.base_offset = model.init_.mean627 elif safe_isinstance(model.init_, ["sklearn.ensemble.QuantileEstimator", "sklearn.ensemble.gradient_boosting.QuantileEstimator"]):628 self.base_offset = model.init_.quantile629 elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):630 self.base_offset = model.init_.constant_[0]631 else:632 assert False, "Unsupported init model type: " + str(type(model.init_))633 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]634 self.objective = objective_name_map.get(model.criterion, None)635 self.tree_output = "raw_value"636 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingRegressor"]):637 import sklearn638 if self.model_output == "predict":639 self.model_output = "raw"640 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE641 self.base_offset = model._baseline_prediction642 self.trees = []643 for p in model._predictors:644 nodes = p[0].nodes645 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')646 tree = {647 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),648 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),649 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),650 "features": np.array([-2 if n[9] else n[2] for n in nodes]),651 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),652 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),653 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),654 }655 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))656 self.objective = objective_name_map.get(model.loss, None)657 self.tree_output = "raw_value"658 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingClassifier"]):659 import sklearn660 self.base_offset = model._baseline_prediction661 if hasattr(self.base_offset, "__len__") and self.model_output != "raw":662 raise Exception("Multi-output HistGradientBoostingClassifier models are not yet supported unless model_output=\"raw\". 
See GitHub issue #1028")663 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE664 self.num_stacked_models = len(model._predictors[0])665 if self.model_output == "predict_proba":666 if self.num_stacked_models == 1:667 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match668 else:669 self.model_output = "probability"670 self.trees = []671 for p in model._predictors:672 for i in range(self.num_stacked_models):673 nodes = p[i].nodes674 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')675 tree = {676 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),677 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),678 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),679 "features": np.array([-2 if n[9] else n[2] for n in nodes]),680 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),681 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),682 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),683 }684 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))685 self.objective = objective_name_map.get(model.loss, None)686 self.tree_output = "log_odds"687 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier","sklearn.ensemble._gb.GradientBoostingClassifier", "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]):688 self.input_dtype = np.float32689 # TODO: deal with estimators for each class690 if model.estimators_.shape[1] > 1:691 assert False, "GradientBoostingClassifier is only supported for binary classification right now!"692 # currently we only support the logs odds estimator693 if safe_isinstance(model.init_, ["sklearn.ensemble.LogOddsEstimator", "sklearn.ensemble.gradient_boosting.LogOddsEstimator"]):694 self.base_offset = model.init_.prior695 self.tree_output = "log_odds"696 elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):697 self.base_offset = scipy.special.logit(model.init_.class_prior_[1]) # with two classes the trees only model the second class. 
# pylint: disable=no-member698 self.tree_output = "log_odds"699 else:700 assert False, "Unsupported init model type: " + str(type(model.init_))701 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]702 self.objective = objective_name_map.get(model.criterion, None)703 elif "pyspark.ml" in str(type(model)):704 assert_import("pyspark")705 self.model_type = "pyspark"706 # model._java_obj.getImpurity() can be gini, entropy or variance.707 self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)708 if "Classification" in str(type(model)):709 normalize = True710 self.tree_output = "probability"711 else:712 normalize = False713 self.tree_output = "raw_value"714 # Spark Random forest, create 1 weighted (avg) tree per sub-model715 if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \716 or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):717 sum_weight = sum(model.treeWeights) # output is average of trees718 self.trees = [SingleTree(tree, normalize=normalize, scaling=model.treeWeights[i]/sum_weight) for i, tree in enumerate(model.trees)]719 # Spark GBT, create 1 weighted (learning rate) tree per sub-model720 elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \721 or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):722 self.objective = "squared_error" # GBT subtree use the variance723 self.tree_output = "raw_value"724 self.trees = [SingleTree(tree, normalize=False, scaling=model.treeWeights[i]) for i, tree in enumerate(model.trees)]725 # Spark Basic model (single tree)726 elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \727 or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):728 self.trees = [SingleTree(model, normalize=normalize, scaling=1)]729 else:730 assert False, "Unsupported Spark model type: " + str(type(model))731 elif safe_isinstance(model, "xgboost.core.Booster"):732 import xgboost733 self.original_model = model734 self.model_type = "xgboost"735 xgb_loader = XGBTreeModelLoader(self.original_model)736 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)737 self.base_offset = xgb_loader.base_score738 self.objective = objective_name_map.get(xgb_loader.name_obj, None)739 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)740 if xgb_loader.num_class > 0:741 self.num_stacked_models = xgb_loader.num_class742 elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):743 import xgboost744 self.input_dtype = np.float32745 self.model_type = "xgboost"746 self.original_model = model.get_booster()747 xgb_loader = XGBTreeModelLoader(self.original_model)748 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)749 self.base_offset = xgb_loader.base_score750 self.objective = objective_name_map.get(xgb_loader.name_obj, None)751 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)752 self.tree_limit = getattr(model, "best_ntree_limit", None)753 if xgb_loader.num_class > 0:754 self.num_stacked_models = xgb_loader.num_class755 if self.model_output == "predict_proba":756 if self.num_stacked_models == 1:757 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match758 else:759 self.model_output = "probability"760 elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):761 import xgboost762 self.original_model = 
model.get_booster()763 self.model_type = "xgboost"764 xgb_loader = XGBTreeModelLoader(self.original_model)765 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)766 self.base_offset = xgb_loader.base_score767 self.objective = objective_name_map.get(xgb_loader.name_obj, None)768 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)769 self.tree_limit = getattr(model, "best_ntree_limit", None)770 if xgb_loader.num_class > 0:771 self.num_stacked_models = xgb_loader.num_class772 elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):773 import xgboost774 self.original_model = model.get_booster()775 self.model_type = "xgboost"776 xgb_loader = XGBTreeModelLoader(self.original_model)777 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)778 self.base_offset = xgb_loader.base_score779 # Note: for ranker, leaving tree_output and objective as None as they780 # are not implemented in native code yet781 self.tree_limit = getattr(model, "best_ntree_limit", None)782 if xgb_loader.num_class > 0:783 self.num_stacked_models = xgb_loader.num_class784 elif safe_isinstance(model, "lightgbm.basic.Booster"):785 assert_import("lightgbm")786 self.model_type = "lightgbm"787 self.original_model = model788 tree_info = self.original_model.dump_model()["tree_info"]789 try:790 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]791 except:792 self.trees = None # we get here because the cext can't handle categorical splits yet793 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)794 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)795 elif safe_isinstance(model, "gpboost.basic.Booster"):796 assert_import("gpboost")797 self.model_type = "gpboost"798 self.original_model = model799 tree_info = self.original_model.dump_model()["tree_info"]800 try:801 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]802 except:803 self.trees = None # we get here because the cext can't handle categorical splits yet804 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)805 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)806 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):807 assert_import("lightgbm")808 self.model_type = "lightgbm"809 self.original_model = model.booster_810 tree_info = self.original_model.dump_model()["tree_info"]811 try:812 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]813 except:814 self.trees = None # we get here because the cext can't handle categorical splits yet815 self.objective = objective_name_map.get(model.objective, None)816 self.tree_output = tree_output_name_map.get(model.objective, None)817 if model.objective is None:818 self.objective = "squared_error"819 self.tree_output = "raw_value"820 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):821 assert_import("lightgbm")822 self.model_type = "lightgbm"823 self.original_model = model.booster_824 tree_info = self.original_model.dump_model()["tree_info"]825 try:826 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]827 except:828 self.trees = None # we get here because the cext can't handle categorical splits yet829 # Note: for ranker, leaving tree_output and objective as None as they830 # are not implemented in native code yet831 elif safe_isinstance(model, 
"lightgbm.sklearn.LGBMClassifier"):832 assert_import("lightgbm")833 self.model_type = "lightgbm"834 if model.n_classes_ > 2:835 self.num_stacked_models = model.n_classes_836 self.original_model = model.booster_837 tree_info = self.original_model.dump_model()["tree_info"]838 try:839 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]840 except:841 self.trees = None # we get here because the cext can't handle categorical splits yet842 self.objective = objective_name_map.get(model.objective, None)843 self.tree_output = tree_output_name_map.get(model.objective, None)844 if model.objective is None:845 self.objective = "binary_crossentropy"846 self.tree_output = "log_odds"847 elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):848 assert_import("catboost")849 self.model_type = "catboost"850 self.original_model = model851 self.cat_feature_indices = model.get_cat_feature_indices()852 elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):853 assert_import("catboost")854 self.model_type = "catboost"855 self.original_model = model856 self.input_dtype = np.float32857 try:858 cb_loader = CatBoostTreeModelLoader(model)859 self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)860 except:861 self.trees = None # we get here because the cext can't handle categorical splits yet862 self.tree_output = "log_odds"863 self.objective = "binary_crossentropy"864 self.cat_feature_indices = model.get_cat_feature_indices()865 elif safe_isinstance(model, "catboost.core.CatBoost"):866 assert_import("catboost")867 self.model_type = "catboost"868 self.original_model = model869 self.cat_feature_indices = model.get_cat_feature_indices()870 elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):871 self.input_dtype = np.float32872 scaling = 1.0 / len(model.estimators_) # output is average of trees873 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]874 self.objective = objective_name_map.get(model.criterion, None)875 self.tree_output = "probability"876 elif safe_isinstance(model, "ngboost.ngboost.NGBoost") or safe_isinstance(model, "ngboost.api.NGBRegressor") or safe_isinstance(model, "ngboost.api.NGBClassifier"):877 assert model.base_models, "The NGBoost model has empty `base_models`! Have you called `model.fit`?"878 if self.model_output == "raw":879 param_idx = 0 # default to the first parameter of the output distribution880 warnings.warn("Translating model_ouput=\"raw\" to model_output=0 for the 0-th parameter in the distribution. 
Use model_output=0 directly to avoid this warning.")881 elif type(self.model_output) is int:882 param_idx = self.model_output883 self.model_output = "raw" # note that after loading we have a new model_output type884 assert safe_isinstance(model.base_models[0][param_idx], ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]), "You must use default_tree_learner!"885 shap_trees = [trees[param_idx] for trees in model.base_models]886 self.internal_dtype = shap_trees[0].tree_.value.dtype.type887 self.input_dtype = np.float32888 scaling = - model.learning_rate * np.array(model.scalings) # output is weighted average of trees889 self.trees = [SingleTree(e.tree_, scaling=s, data=data, data_missing=data_missing) for e,s in zip(shap_trees,scaling)]890 self.objective = objective_name_map.get(shap_trees[0].criterion, None)891 self.tree_output = "raw_value"892 self.base_offset = model.init_params[param_idx]893 else:894 raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))895 # build a dense numpy version of all the tree objects896 if self.trees is not None and self.trees:897 max_nodes = np.max([len(t.values) for t in self.trees])898 assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, "All trees in the ensemble must have the same output dimension!"899 num_trees = len(self.trees)900 if self.num_stacked_models > 1:901 assert len(self.trees) % self.num_stacked_models == 0, "Only stacked models with equal numbers of trees are supported!"902 assert self.trees[0].values.shape[1] == 1, "Only stacked models with single outputs per model are supported!"903 self.num_outputs = self.num_stacked_models904 else:905 self.num_outputs = self.trees[0].values.shape[1]906 # important to be -1 in unused sections!! 
This way we can tell which entries are valid.907 self.children_left = -np.ones((num_trees, max_nodes), dtype=np.int32)908 self.children_right = -np.ones((num_trees, max_nodes), dtype=np.int32)909 self.children_default = -np.ones((num_trees, max_nodes), dtype=np.int32)910 self.features = -np.ones((num_trees, max_nodes), dtype=np.int32)911 self.thresholds = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)912 self.values = np.zeros((num_trees, max_nodes, self.num_outputs), dtype=self.internal_dtype)913 self.node_sample_weight = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)914 for i in range(num_trees):915 self.children_left[i,:len(self.trees[i].children_left)] = self.trees[i].children_left916 self.children_right[i,:len(self.trees[i].children_right)] = self.trees[i].children_right917 self.children_default[i,:len(self.trees[i].children_default)] = self.trees[i].children_default918 self.features[i,:len(self.trees[i].features)] = self.trees[i].features919 self.thresholds[i,:len(self.trees[i].thresholds)] = self.trees[i].thresholds920 if self.num_stacked_models > 1:921 # stack_pos = int(i // (num_trees / self.num_stacked_models))922 stack_pos = i % self.num_stacked_models923 self.values[i,:len(self.trees[i].values[:,0]),stack_pos] = self.trees[i].values[:,0]924 else:925 self.values[i,:len(self.trees[i].values)] = self.trees[i].values926 self.node_sample_weight[i,:len(self.trees[i].node_sample_weight)] = self.trees[i].node_sample_weight927 # ensure that the passed background dataset lands in every leaf928 if np.min(self.trees[i].node_sample_weight) <= 0:929 self.fully_defined_weighting = False930 self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)931 self.max_depth = np.max([t.max_depth for t in self.trees])932 # make sure the base offset is a 1D array933 if not hasattr(self.base_offset, "__len__") or len(self.base_offset) == 0:934 self.base_offset = (np.ones(self.num_outputs) * self.base_offset).astype(self.internal_dtype)935 self.base_offset = self.base_offset.flatten()936 assert len(self.base_offset) == self.num_outputs937 def get_transform(self):938 """ A consistent interface to make predictions from this model.939 """940 if self.model_output == "raw":941 transform = "identity"942 elif self.model_output == "probability" or self.model_output == "probability_doubled":943 if self.tree_output == "log_odds":944 transform = "logistic"945 elif self.tree_output == "probability":946 transform = "identity"947 else:948 raise Exception("model_output = \"probability\" is not yet supported when model.tree_output = \"" + self.tree_output + "\"!")949 elif self.model_output == "log_loss":950 if self.objective == "squared_error":951 transform = "squared_loss"952 elif self.objective == "binary_crossentropy":953 transform = "logistic_nlogloss"954 else:955 raise Exception("model_output = \"log_loss\" is not yet supported when model.objective = \"" + self.objective + "\"!")956 else:957 raise Exception("Unrecognized model_output parameter value: %s! If model.%s is a valid function open a github issue to ask that this method be supported. If you want 'predict_proba' just use 'probability' for now." % (str(self.model_output), str(self.model_output)))958 return transform959 def predict(self, X, y=None, output=None, tree_limit=None):960 """ A consistent interface to make predictions from this model.961 Parameters962 ----------963 tree_limit : None (default) or int964 Limit the number of trees used by the model. 
By default None means no use the limit of the965 original model, and -1 means no limit.966 """967 if output is None:968 output = self.model_output969 if self.model_type == "pyspark":970 #import pyspark971 # TODO: support predict for pyspark972 raise NotImplementedError("Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")973 # see if we have a default tree_limit in place.974 if tree_limit is None:975 tree_limit = -1 if self.tree_limit is None else self.tree_limit976 # convert dataframes977 if safe_isinstance(X, "pandas.core.series.Series"):978 X = X.values979 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):980 X = X.values981 flat_output = False982 if len(X.shape) == 1:983 flat_output = True984 X = X.reshape(1, X.shape[0])985 if X.dtype.type != self.input_dtype:986 X = X.astype(self.input_dtype)987 X_missing = np.isnan(X, dtype=np.bool)988 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))989 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"990 if tree_limit < 0 or tree_limit > self.values.shape[0]:991 tree_limit = self.values.shape[0]992 if output == "logloss":993 assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"994 assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])995 transform = self.get_transform()996 assert_import("cext")997 output = np.zeros((X.shape[0], self.num_outputs))998 _cext.dense_tree_predict(999 self.children_left, self.children_right, self.children_default,1000 self.features, self.thresholds, self.values,1001 self.max_depth, tree_limit, self.base_offset, output_transform_codes[transform],1002 X, X_missing, y, output1003 )1004 # drop dimensions we don't need1005 if flat_output:1006 if self.num_outputs == 1:1007 return output.flatten()[0]1008 else:1009 return output.reshape(-1, self.num_outputs)1010 else:1011 if self.num_outputs == 1:1012 return output.flatten()1013 else:1014 return output1015class SingleTree:1016 """ A single decision tree.1017 The primary point of this object is to parse many different tree types into a common format.1018 """1019 def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):1020 assert_import("cext")1021 if safe_isinstance(tree, ["sklearn.tree._tree.Tree", "econml.tree._tree.Tree"]):1022 self.children_left = tree.children_left.astype(np.int32)1023 self.children_right = tree.children_right.astype(np.int32)1024 self.children_default = self.children_left # missing values not supported in sklearn1025 self.features = tree.feature.astype(np.int32)1026 self.thresholds = tree.threshold.astype(np.float64)1027 self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])1028 if normalize:1029 self.values = (self.values.T / self.values.sum(1)).T1030 self.values = self.values * scaling1031 self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)1032 elif type(tree) is dict and 'features' in tree:1033 self.children_left = tree["children_left"].astype(np.int32)1034 self.children_right = tree["children_right"].astype(np.int32)1035 self.children_default = tree["children_default"].astype(np.int32)1036 self.features = tree["features"].astype(np.int32)1037 self.thresholds = tree["thresholds"]1038 self.values = tree["values"] * scaling1039 self.node_sample_weight = tree["node_sample_weight"]1040 # deprecated dictionary 
# ...support (with sklearn singular style "feature" and "value" names)
        elif type(tree) is dict and 'children_left' in tree:
            self.children_left = tree["children_left"].astype(np.int32)
            self.children_right = tree["children_right"].astype(np.int32)
            self.children_default = tree["children_default"].astype(np.int32)
            self.features = tree["feature"].astype(np.int32)
            self.thresholds = tree["threshold"]
            self.values = tree["value"] * scaling
            self.node_sample_weight = tree["node_sample_weight"]
        elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \
                or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):
            # model._java_obj.numNodes() doesn't count leaves, so we need to recompute the size
            def getNumNodes(node, size):
                size = size + 1
                if node.subtreeDepth() == 0:
                    return size
                else:
                    size = getNumNodes(node.leftChild(), size)
                    return getNumNodes(node.rightChild(), size)

            num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)
            self.children_left = np.full(num_nodes, -2, dtype=np.int32)
            self.children_right = np.full(num_nodes, -2, dtype=np.int32)
            self.children_default = np.full(num_nodes, -2, dtype=np.int32)
            self.features = np.full(num_nodes, -2, dtype=np.int32)
            self.thresholds = np.full(num_nodes, -2, dtype=np.float64)
            self.values = [-2] * num_nodes
            self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)

            def buildTree(index, node):
                index = index + 1
                if tree._java_obj.getImpurity() == 'variance':
                    self.values[index] = [node.prediction()]  # prediction for the node
                else:
                    # for gini: ndarray(numLabel), one entry per label: the number of items of that label which went through this node
                    self.values[index] = [e for e in node.impurityStats().stats()]
                self.node_sample_weight[index] = node.impurityStats().count()  # weighted count of elements through this node
                if node.subtreeDepth() == 0:
                    return index
                else:
                    self.features[index] = node.split().featureIndex()  # index of the feature we split on, not available for leaves, int
                    if str(node.split().getClass()).endswith('tree.CategoricalSplit'):
                        # categorical splits aren't implemented, TODO: could fake it by creating a fake node to split on the exact value?
                        raise NotImplementedError('CategoricalSplit are not yet implemented')
                    self.thresholds[index] = node.split().threshold()  # threshold for the feature, not available for leaves, float
                    self.children_left[index] = index + 1
                    idx = buildTree(index, node.leftChild())
                    self.children_right[index] = idx + 1
                    idx = buildTree(idx, node.rightChild())
                    return idx

            buildTree(-1, tree._java_obj.rootNode())
            # default not supported with MLlib? (TODO)
            self.children_default = self.children_left
            self.values = np.asarray(self.values)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
            self.values = self.values * scaling
        elif type(tree) == dict and 'tree_structure' in tree:  # LightGBM model dump
            start = tree['tree_structure']
            num_parents = tree['num_leaves'] - 1
            self.children_left = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_right = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_default = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.features = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.thresholds = np.empty((2 * num_parents + 1), dtype=np.float64)
            self.values = [-2] * (2 * num_parents + 1)
            self.node_sample_weight = np.empty((2 * num_parents + 1), dtype=np.float64)
            visited, queue = [], [start]
            while queue:
                vertex = queue.pop(0)
                if 'split_index' in vertex.keys():
                    if vertex['split_index'] not in visited:
                        if 'split_index' in vertex['left_child'].keys():
                            self.children_left[vertex['split_index']] = vertex['left_child']['split_index']
                        else:
                            self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index'] + num_parents
                        if 'split_index' in vertex['right_child'].keys():
                            self.children_right[vertex['split_index']] = vertex['right_child']['split_index']
                        else:
                            self.children_right[vertex['split_index']] = vertex['right_child']['leaf_index'] + num_parents
                        if vertex['default_left']:
                            self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]
                        else:
                            self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]
                        self.features[vertex['split_index']] = vertex['split_feature']
                        self.thresholds[vertex['split_index']] = vertex['threshold']
                        self.values[vertex['split_index']] = [vertex['internal_value']]
                        self.node_sample_weight[vertex['split_index']] = vertex['internal_count']
                        visited.append(vertex['split_index'])
                        queue.append(vertex['left_child'])
                        queue.append(vertex['right_child'])
                else:
                    self.children_left[vertex['leaf_index'] + num_parents] = -1
                    self.children_right[vertex['leaf_index'] + num_parents] = -1
                    self.children_default[vertex['leaf_index'] + num_parents] = -1
                    self.features[vertex['leaf_index'] + num_parents] = -1
                    self.thresholds[vertex['leaf_index'] + num_parents] = -1
                    self.values[vertex['leaf_index'] + num_parents] = [vertex['leaf_value']]
                    self.node_sample_weight[vertex['leaf_index'] + num_parents] = vertex['leaf_count']
            self.values = np.asarray(self.values)
            self.values = np.multiply(self.values, scaling)
        elif type(tree) == dict and 'nodeid' in tree:
            # directly create the tree given the JSON dump (with stats) of an XGBoost model
            def max_id(node):
                if "children" in node:
                    return max(node["nodeid"], *[max_id(n) for n in node["children"]])
                else:
                    return node["nodeid"]

            m = max_id(tree) + 1
            self.children_left = -np.ones(m, dtype=np.int32)
            self.children_right = -np.ones(m, dtype=np.int32)
            self.children_default = -np.ones(m, dtype=np.int32)
            self.features = -np.ones(m, dtype=np.int32)
            self.thresholds = np.zeros(m, dtype=np.float64)
            self.values = np.zeros((m, 1), dtype=np.float64)
            self.node_sample_weight = np.empty(m, dtype=np.float64)

            def extract_data(node, tree):
                i = node["nodeid"]
                tree.node_sample_weight[i] = node["cover"]
                if "children" in node:
                    tree.children_left[i] = node["yes"]
                    tree.children_right[i] = node["no"]
                    tree.children_default[i] = node["missing"]
                    tree.features[i] = node["split"]
                    tree.thresholds[i] = node["split_condition"]
                    for n in node["children"]:
                        extract_data(n, tree)
                elif "leaf" in node:
                    tree.values[i] = node["leaf"] * scaling

            extract_data(tree, self)
        elif type(tree) == str:
            # build a tree from a text dump (with stats) of XGBoost
            nodes = [t.lstrip() for t in tree[:-1].split("\n")]
            nodes_dict = {}
            for n in nodes:
                nodes_dict[int(n.split(":")[0])] = n.split(":")[1]
            m = max(nodes_dict.keys()) + 1
            children_left = -1 * np.ones(m, dtype="int32")
            children_right = -1 * np.ones(m, dtype="int32")
            children_default = -1 * np.ones(m, dtype="int32")
            features = -2 * np.ones(m, dtype="int32")
            thresholds = -1 * np.ones(m, dtype="float64")
            values = 1 * np.ones(m, dtype="float64")
            node_sample_weight = np.zeros(m, dtype="float64")
            values_lst = list(nodes_dict.values())
            keys_lst = list(nodes_dict.keys())
            for i in range(0, len(keys_lst)):
                value = values_lst[i]
                key = keys_lst[i]
                if "leaf" in value:
                    # extract the leaf value and cover
                    val = float(value.split("leaf=")[1].split(",")[0])
                    node_sample_weight_val = float(value.split("cover=")[1])
                    # append to the arrays
                    values[key] = val
                    node_sample_weight[key] = node_sample_weight_val
                else:
                    c_left = int(value.split("yes=")[1].split(",")[0])
                    c_right = int(value.split("no=")[1].split(",")[0])
                    c_default = int(value.split("missing=")[1].split(",")[0])
                    feat_thres = value.split(" ")[0]
                    if "<" in feat_thres:
                        feature = int(feat_thres.split("<")[0][2:])
                        threshold = float(feat_thres.split("<")[1][:-1])
                    if "=" in feat_thres:
                        feature = int(feat_thres.split("=")[0][2:])
                        threshold = float(feat_thres.split("=")[1][:-1])
                    node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])
                    children_left[key] = c_left
                    children_right[key] = c_right
                    children_default[key] = c_default
                    features[key] = feature
                    thresholds[key] = threshold
                    node_sample_weight[key] = node_sample_weight_val
            self.children_left = children_left
            self.children_right = children_right
            self.children_default = children_default
            self.features = features
            self.thresholds = thresholds
            self.values = values[:, np.newaxis] * scaling
            self.node_sample_weight = node_sample_weight
        else:
            raise Exception("Unknown input to SingleTree constructor: " + str(tree))

        # re-compute the number of samples that pass through each node if we are given data
        if data is not None and data_missing is not None:
            self.node_sample_weight[:] = 0.0
            _cext.dense_tree_update_weights(
                self.children_left, self.children_right, self.children_default, self.features,
                self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing
            )
        # we compute the expectations to make sure they follow the SHAP logic
        self.max_depth = _cext.compute_expectations(
            self.children_left, self.children_right, self.node_sample_weight,
            self.values
        )

class IsoTree(SingleTree):
    """ In sklearn the values stored in the trees of an Isolation Forest are not calculated
    the way SHAP needs them, so they are recalculated here.
    """
    def __init__(self, tree, tree_features, normalize=False, scaling=1.0, data=None, data_missing=None):
        super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)
        if safe_isinstance(tree, "sklearn.tree._tree.Tree"):
            from sklearn.ensemble._iforest import _average_path_length  # pylint: disable=no-name-in-module

            def _recalculate_value(tree, i, level):
                if tree.children_left[i] == -1 and tree.children_right[i] == -1:
                    value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]
                    self.values[i, 0] = value
                    return value * tree.n_node_samples[i]
                else:
                    value_left = _recalculate_value(tree, tree.children_left[i], level + 1)
                    value_right = _recalculate_value(tree, tree.children_right[i], level + 1)
                    self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]
                    return value_left + value_right

            _recalculate_value(tree, 0, 0)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
...
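Every parser branch above is selected with safe_isinstance, which tests an object against a fully qualified class name given as a string (or a list of them), so optional dependencies such as pyspark, xgboost, or lightgbm are never imported just to perform the check. A minimal sketch of that idea, using only the standard library (an illustration of the pattern, not the library's exact implementation):

import sys

def safe_isinstance(obj, class_path_str):
    """Check isinstance against dotted "module.ClassName" strings, looking
    only at modules the caller has already imported."""
    if isinstance(class_path_str, str):
        class_path_str = [class_path_str]
    for class_path in class_path_str:
        module_name, _, class_name = class_path.rpartition(".")
        module = sys.modules.get(module_name)  # never triggers an import
        cls = getattr(module, class_name, None) if module is not None else None
        if cls is not None and isinstance(obj, cls):
            return True
    return False

For example, safe_isinstance(model, "xgboost.core.Booster") is simply False whenever xgboost was never imported, which is what lets the constructors above probe many frameworks safely.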


base_tree.py

Source:base_tree.py Github


...
            self.base_offset = model["base_offset"]
            self.trees = [SingleTree(t, data=data, data_missing=data_missing) for t in model["trees"]]
        elif type(model) is list and type(model[0]) == SingleTree:  # old-style direct-load format
            self.trees = model
        elif safe_isinstance(model, ["sklearn.ensemble.RandomForestRegressor",
                                     "sklearn.ensemble.forest.RandomForestRegressor",
                                     "econml.grf._base_grf.BaseGRF"]):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble._iforest.IsolationForest"]):
            self.dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing)
                          for e, f in zip(model.estimators_, model.estimators_features_)]
            self.tree_output = "raw_value"
        elif safe_isinstance(model, ["pyod.models.iforest.IForest"]):
            self.dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing)
                          for e, f in zip(model.detector_.estimators_, model.detector_.estimators_features_)]
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor",
                                     "econml.grf._base_grftree.GRFTree"]):
            self.internal_dtype = model.tree_.value.dtype.type
            self.input_dtype = np.float32
            self.trees = [SingleTree(model.tree_, data=data, data_missing=data_missing)]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
            # self.scaling = 1
        elif safe_isinstance(model, ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]):
            self.internal_dtype = model.tree_.value.dtype.type
            self.input_dtype = np.float32
            self.trees = [SingleTree(model.tree_, normalize=True, data=data, data_missing=data_missing)]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
            # self.scaling = 1
        elif safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier",
                                     "sklearn.ensemble.forest.RandomForestClassifier"]):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier",
                                     "sklearn.ensemble.forest.ExtraTreesClassifier"]):  # TODO: add unit test for this case
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            # self.scaling = scaling
            self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor",
                                     "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]):
            self.input_dtype = np.float32
            # currently we only support the mean and quantile estimators
            if safe_isinstance(model.init_, ["sklearn.ensemble.MeanEstimator",
                                             "sklearn.ensemble.gradient_boosting.MeanEstimator"]):
                self.base_offset = model.init_.mean
            elif safe_isinstance(model.init_, ["sklearn.ensemble.QuantileEstimator",
                                               "sklearn.ensemble.gradient_boosting.QuantileEstimator"]):
                self.base_offset = model.init_.quantile
            elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):
                self.base_offset = model.init_.constant_[0]
            else:
                assert False, "Unsupported init model type: " + str(type(model.init_))
            self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing)
                          for e in model.estimators_[:, 0]]
            # self.scaling = model.learning_rate
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingRegressor"]):
            import sklearn
            if self.model_output == "predict":
                self.model_output = "raw"
            self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE
            self.base_offset = model._baseline_prediction
            self.trees = []
            for p in model._predictors:
                nodes = p[0].nodes
                # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left',
                # 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')
                tree = {
                    "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),
                    "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),
                    "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),
                    "features": np.array([-2 if n[9] else n[2] for n in nodes]),
                    "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),
                    "values": np.array([[n[0]] for n in nodes], dtype=np.float64),
                    "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),
                }
                self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))
            self.objective = objective_name_map.get(model.loss, None)
            self.tree_output = "raw_value"
            # self.scaling = 1
        elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingClassifier"]):
            import sklearn
            self.base_offset = model._baseline_prediction
            if hasattr(self.base_offset, "__len__") and self.model_output != "raw":
                raise Exception("Multi-output HistGradientBoostingClassifier models are not yet supported "
                                "unless model_output=\"raw\". See GitHub issue #1028")
            self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE
            self.num_stacked_models = len(model._predictors[0])
            if self.model_output == "predict_proba":
                if self.num_stacked_models == 1:
                    self.model_output = "probability_doubled"  # with predict_proba we need to double the outputs to match
                else:
                    self.model_output = "probability"
            self.trees = []
            for p in model._predictors:
                for i in range(self.num_stacked_models):
                    nodes = p[i].nodes
                    # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left',
                    # 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')
                    tree = {
                        "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),
                        "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),
                        "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),
                        "features": np.array([-2 if n[9] else n[2] for n in nodes]),
                        "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),
                        "values": np.array([[n[0]] for n in nodes], dtype=np.float64),
                        "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),
                    }
                    self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))
            self.objective = objective_name_map.get(model.loss, None)
            self.tree_output = "log_odds"
            # self.scaling = 1
        elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier",
                                     "sklearn.ensemble._gb.GradientBoostingClassifier",
                                     "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]):
            self.input_dtype = np.float32
            # TODO: deal with estimators for each class
            if model.estimators_.shape[1] > 1:
                assert False, "GradientBoostingClassifier is only supported for binary classification right now!"
            # currently we only support the log odds estimator
            if safe_isinstance(model.init_, ["sklearn.ensemble.LogOddsEstimator",
                                             "sklearn.ensemble.gradient_boosting.LogOddsEstimator"]):
                self.base_offset = model.init_.prior
                self.tree_output = "log_odds"
            elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):
                # with two classes the trees only model the second class
                self.base_offset = scipy.special.logit(model.init_.class_prior_[1])  # pylint: disable=no-member
                self.tree_output = "log_odds"
            else:
                assert False, "Unsupported init model type: " + str(type(model.init_))
            self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing)
                          for e in model.estimators_[:, 0]]
            # self.scaling = model.learning_rate
            self.objective = objective_name_map.get(model.criterion, None)
        elif "pyspark.ml" in str(type(model)):
            assert_import("pyspark")
            self.model_type = "pyspark"
            # model._java_obj.getImpurity() can be gini, entropy or variance.
            self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)
            if "Classification" in str(type(model)):
                normalize = True
                self.tree_output = "probability"
            else:
                normalize = False
                self.tree_output = "raw_value"
            # Spark random forest: create one weighted (averaged) tree per sub-model
            if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):
                sum_weight = sum(model.treeWeights)  # output is average of trees
                self.trees = [SingleTree(tree, normalize=normalize, scaling=model.treeWeights[i] / sum_weight)
                              for i, tree in enumerate(model.trees)]
                # self.scaling = model.treeWeights[i] / sum_weight
            # Spark GBT: create one weighted (learning rate) tree per sub-model
            elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):
                self.objective = "squared_error"  # GBT subtrees use the variance
                self.tree_output = "raw_value"
                self.trees = [SingleTree(tree, normalize=False, scaling=model.treeWeights[i])
                              for i, tree in enumerate(model.trees)]
            # Spark basic model (single tree)
            elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):
                self.trees = [SingleTree(model, normalize=normalize, scaling=1)]
            else:
                assert False, "Unsupported Spark model type: " + str(type(model))
        elif safe_isinstance(model, "xgboost.core.Booster"):
            import xgboost
            self.original_model = model
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
            if xgb_loader.num_class > 0:
                self.num_stacked_models = xgb_loader.num_class
            # self.scaling = 1
        elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):
            import xgboost
            self.input_dtype = np.float32
            self.model_type = "xgboost"
            self.original_model = model.get_booster()
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
            self.tree_limit = getattr(model, "best_ntree_limit", None)
            if xgb_loader.num_class > 0:
                self.num_stacked_models = xgb_loader.num_class
            if self.model_output == "predict_proba":
                if self.num_stacked_models == 1:
                    self.model_output = "probability_doubled"  # with predict_proba we need to double the outputs to match
                else:
                    self.model_output = "probability"
            # self.scaling = 1
        elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):
            import xgboost
            self.original_model = model.get_booster()
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
            self.tree_limit = getattr(model, "best_ntree_limit", None)
            if xgb_loader.num_class > 0:
                self.num_stacked_models = xgb_loader.num_class
            # self.scaling = 1
        elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):
            import xgboost
            self.original_model = model.get_booster()
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            # Note: for ranker, leaving tree_output and objective as None as they
            # are not implemented in native code yet
            self.tree_limit = getattr(model, "best_ntree_limit", None)
            if xgb_loader.num_class > 0:
                self.num_stacked_models = xgb_loader.num_class
            # self.scaling = 1
        elif safe_isinstance(model, "lightgbm.basic.Booster"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)
            self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)
            # self.scaling = 1
        elif safe_isinstance(model, "gpboost.basic.Booster"):
            assert_import("gpboost")
            self.model_type = "gpboost"
            self.original_model = model
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)
            self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)
            # self.scaling = 1
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.objective, None)
            self.tree_output = tree_output_name_map.get(model.objective, None)
            if model.objective is None:
                self.objective = "squared_error"
                self.tree_output = "raw_value"
            # self.scaling = 1
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            # Note: for ranker, leaving tree_output and objective as None as they
            # are not implemented in native code yet
            # self.scaling = 1
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            if model.n_classes_ > 2:
                self.num_stacked_models = model.n_classes_
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.objective, None)
            self.tree_output = tree_output_name_map.get(model.objective, None)
            if model.objective is None:
                self.objective = "binary_crossentropy"
                self.tree_output = "log_odds"
            # self.scaling = 1
        elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
            self.cat_feature_indices = model.get_cat_feature_indices()
            # self.scaling = 1
        elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
            self.input_dtype = np.float32
            try:
                cb_loader = CatBoostTreeModelLoader(model)
                self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)
            except Exception:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.tree_output = "log_odds"
            self.objective = "binary_crossentropy"
            self.cat_feature_indices = model.get_cat_feature_indices()
            # self.scaling = 1
        elif safe_isinstance(model, "catboost.core.CatBoost"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
            self.cat_feature_indices = model.get_cat_feature_indices()
            # self.scaling = 1
        elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing)
                          for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
            # self.scaling = 1
        elif safe_isinstance(model, "ngboost.ngboost.NGBoost") \
                or safe_isinstance(model, "ngboost.api.NGBRegressor") \
                or safe_isinstance(model, "ngboost.api.NGBClassifier"):
            assert model.base_models, "The NGBoost model has empty `base_models`! Have you called `model.fit`?"
            if self.model_output == "raw":
                param_idx = 0  # default to the first parameter of the output distribution
                warnings.warn("Translating model_output=\"raw\" to model_output=0 for the 0-th parameter in the "
                              "distribution. Use model_output=0 directly to avoid this warning.")
            elif type(self.model_output) is int:
                param_idx = self.model_output
                self.model_output = "raw"  # note that after loading we have a new model_output type
            assert safe_isinstance(model.base_models[0][param_idx],
                                   ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]), \
                "You must use default_tree_learner!"
            shap_trees = [trees[param_idx] for trees in model.base_models]
            self.internal_dtype = shap_trees[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = -model.learning_rate * np.array(model.scalings)  # output is weighted average of trees
            self.trees = [SingleTree(e.tree_, scaling=s, data=data, data_missing=data_missing)
                          for e, s in zip(shap_trees, scaling)]
            self.objective = objective_name_map.get(shap_trees[0].criterion, None)
            self.tree_output = "raw_value"
            self.base_offset = model.init_params[param_idx]
            # self.scaling = 1
        else:
            raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))

        # build a dense numpy version of all the tree objects
        if self.trees is not None and self.trees:
            max_nodes = np.max([len(t.values) for t in self.trees])
            assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, \
                "All trees in the ensemble must have the same output dimension!"
            num_trees = len(self.trees)
            if self.num_stacked_models > 1:
                assert len(self.trees) % self.num_stacked_models == 0, \
                    "Only stacked models with equal numbers of trees are supported!"
                assert self.trees[0].values.shape[1] == 1, \
                    "Only stacked models with single outputs per model are supported!"
                self.num_outputs = self.num_stacked_models
            else:
                self.num_outputs = self.trees[0].values.shape[1]
            if safe_isinstance(model, ["xgboost.sklearn.XGBClassifier", "catboost.core.CatBoostClassifier",
                                       "lightgbm.sklearn.LGBMClassifier"]) and self.num_outputs == 1:
                self.values_binary = np.zeros((num_trees, max_nodes, 2), dtype=self.internal_dtype)
                for i in range(num_trees):
                    # y = self.model.predict(self.data)
                    # self.trees[i].values = np.zeros((max_nodes, self.num_outputs))
                    # rebuild_acvtree(0, self.trees[i], self.data, y)
                    # self.trees[i].values = self.trees[i].scaling * self.trees[i].values
                    # p = np.exp(self.trees[i].values)/(1+np.exp(self.trees[i].values))
                    p = 1 / (1 + np.exp(-self.trees[i].values))
                    self.values_binary[i, :len(self.trees[i].values)] = np.concatenate([1 - p, p], axis=1) / num_trees
            # important to be -1 in unused sections!! This way we can tell which entries are valid.
            self.children_left = -np.ones((num_trees, max_nodes), dtype=np.int32)
            self.children_right = -np.ones((num_trees, max_nodes), dtype=np.int32)
            self.children_default = -np.ones((num_trees, max_nodes), dtype=np.int32)
            self.features = -np.ones((num_trees, max_nodes), dtype=np.int32)
            self.thresholds = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)
            self.values = np.zeros((num_trees, max_nodes, self.num_outputs), dtype=self.internal_dtype)
            self.node_sample_weight = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)
            self.partition_leaves_trees = []
            self.node_idx_trees = []
            self.data_leaves_trees = []
            self.leaf_idx_trees = []
            self.leaves_nb = []
            self.scalings = []
            for i in tqdm(range(num_trees)):
                self.scalings.append(self.trees[i].scaling)
                self.children_left[i, :len(self.trees[i].children_left)] = self.trees[i].children_left
                self.children_right[i, :len(self.trees[i].children_right)] = self.trees[i].children_right
                self.children_default[i, :len(self.trees[i].children_default)] = self.trees[i].children_default
                self.features[i, :len(self.trees[i].features)] = self.trees[i].features
                self.thresholds[i, :len(self.trees[i].thresholds)] = self.trees[i].thresholds
                if self.num_stacked_models > 1:
                    # stack_pos = int(i // (num_trees / self.num_stacked_models))
                    stack_pos = i % self.num_stacked_models
                    self.values[i, :len(self.trees[i].values[:, 0]), stack_pos] = self.trees[i].values[:, 0]
                else:
                    self.values[i, :len(self.trees[i].values)] = self.trees[i].values
                self.node_sample_weight[i, :len(self.trees[i].node_sample_weight)] = self.trees[i].node_sample_weight
                # ensure that the passed background dataset lands in every leaf
                if np.min(self.trees[i].node_sample_weight) <= 0:
                    self.fully_defined_weighting = False
                self.leaf_idx = [idx for idx in range(len(self.trees[i].features))
                                 if self.trees[i].children_left[idx] < 0]
                self.leaves_nb.append(len(self.leaf_idx))
                self.partition_leaves = []
                self.node_idx = []
                self.max_var = []
                self.data_leaves = []
                for leaf_id in self.leaf_idx:
                    node_id = [-1]
                    partition_leaf = [np.array([[-np.inf, np.inf]]) for idx2 in range(self.data.shape[1])]
                    _ = get_partition(leaf_id, partition_leaf, node_id, self.trees[i].children_left,
                                      self.trees[i].children_right, self.trees[i].features, self.trees[i].thresholds)
                    self.partition_leaves.append(np.squeeze(np.array(partition_leaf)))
                    self.node_idx.append(list(set(node_id[1:])))
                    self.max_var.append(len(self.node_idx[-1]))
                    # self.data_leaves.append(np.array([(self.data[:, s] <= self.partition_leaves[-1][s, 1]) * \
                    #                                   (self.data[:, s] > self.partition_leaves[-1][s, 0])
                    #                                   for s in range(self.data.shape[1])], dtype=np.int64).transpose())
                self.partition_leaves_trees.append(self.partition_leaves)
                # self.data_leaves_trees.append(self.data_leaves)
                self.node_idx_trees.append(self.node_idx)
                self.leaf_idx_trees.append(self.leaf_idx)
            leaf_idx_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb)), dtype=np.int64)
            partition_leaves_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb), self.data.shape[1], 2))
            # data_leaves_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb), self.data.shape[0], self.data.shape[1]), dtype=np.int64)
            for i in range(len(self.leaves_nb)):
                leaf_idx_trees[i, :self.leaves_nb[i]] = np.array(self.leaf_idx_trees[i], dtype=np.int64)
                partition_leaves_trees[i, :self.leaves_nb[i]] = np.array(self.partition_leaves_trees[i])
                # data_leaves_trees[i, :self.leaves_nb[i]] = np.array(self.data_leaves_trees[i], dtype=np.int64)
            self.leaf_idx_trees = leaf_idx_trees
            self.partition_leaves_trees = partition_leaves_trees
            self.leaves_nb = np.array(self.leaves_nb, dtype=np.int64)
            self.scalings = np.array(self.scalings, dtype=np.float64)
            self.data = np.array(self.data, dtype=np.float64)
            self.max_var = np.max(self.max_var)
            # self.data_leaves_trees = data_leaves_trees
            # if safe_isinstance(model, ["xgboost.sklearn.XGBClassifier",
            #                            "catboost.core.CatBoostClassifier", "lightgbm.sklearn.LGBMClassifier"]) and \
            #         self.num_outputs == 1:
            #     p = np.exp(self.values)/(1 + np.exp(self.values))
            #     print(np.max(p), np.min(1-p))
            #     self.values = np.concatenate([1-p, p], axis=2)
            #     self.num_outputs = 2
            self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)
            self.max_depth = np.max([t.max_depth for t in self.trees])
            if self.cache:
                if self.multi_threads:
                    self.lm, self.lm_s, self.lm_si = self.leaves_cache(C=self.C)
                else:
                    self.lm, self.lm_s, self.lm_si = self.leaves_cache_nopa(C=self.C)
                if self.cache_normalized:
                    if self.multi_threads:
                        self.lm_n, self.lm_s_n, self.lm_si_n = self.leaves_cache_normalized(C=self.C)
                    else:
                        self.lm_n, self.lm_s_n, self.lm_si_n = self.leaves_cache_normalized_nopa(C=self.C)
            # make sure the base offset is a 1D array
            if not hasattr(self.base_offset, "__len__") or len(self.base_offset) == 0:
                self.base_offset = (np.ones(self.num_outputs) * self.base_offset).astype(self.internal_dtype)
            self.base_offset = self.base_offset.flatten()
            assert len(self.base_offset) == self.num_outputs

    @abstractmethod
    def compute_cond_exp(self, X, S, data):
        pass

    @abstractmethod
    def shap_values(self, x, C):
        pass

    @abstractmethod
    def shap_values_acv(self, x, C, S_star, N_star):
        pass

    @abstractmethod
    def compute_sdp_clf(self, X, tX, S, data):
        pass

    @abstractmethod
    def compute_sdp_reg(self, X, tX, S, data):
        pass

    @abstractmethod
    def compute_local_sdp_clf(self, x, threshold, proba, index, data, final_coal, decay, C, verbose):
        pass

    @abstractmethod
    def compute_local_sdp_reg(self, x, threshold, proba, index, data, final_coal, decay, C, verbose):
        pass

    @abstractmethod
    def swing_values_clf(self, x, tx, S, data, threshold):
        pass

    @abstractmethod
    def swing_values_reg(self, x, tx, S, data, threshold):
        pass

    @abstractmethod
    def shap_values_swing_clf(self, x, tx, data, threshold, C):
        pass

    @abstractmethod
    def shap_values_swing_reg(self, x, tx, data, threshold, C):
        pass

    @abstractmethod
    def global_sdp_importance_clf(self, data, data_bground, columns_names, global_proba, decay, threshold,
                                  proba, C, verbose):
        pass

    @abstractmethod
    def global_sdp_importance_reg(self, data, data_bground, columns_names, global_proba, decay, threshold,
                                  proba, C, verbose):
        pass

    def predict(self, X, y=None, output=None, tree_limit=None):
        """ A consistent interface to make predictions from this model.

        Parameters
        ----------
        tree_limit : None (default) or int
            Limit the number of trees used by the model. By default None means use the limit of the
            original model, and -1 means no limit.
        """
        if output is None:
            output = self.model_output
        if self.model_type == "pyspark":
            # import pyspark
            # TODO: support predict for pyspark
            raise NotImplementedError("Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")
        # see if we have a default tree_limit in place.
        if tree_limit is None:
            tree_limit = -1 if self.tree_limit is None else self.tree_limit
        # convert dataframes
        if safe_isinstance(X, "pandas.core.series.Series"):
            X = X.values
        elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
            X = X.values
        flat_output = False
        if len(X.shape) == 1:
            flat_output = True
            X = X.reshape(1, X.shape[0])
        if X.dtype.type != self.input_dtype:
            X = X.astype(self.input_dtype)
        X_missing = np.isnan(X, dtype=bool)
        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"
        if tree_limit < 0 or tree_limit > self.values.shape[0]:
            tree_limit = self.values.shape[0]
        if output == "logloss":
            assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"
            assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])
        # transform = self.get_transform()
        assert_import("cext_acv")
        output = np.zeros((X.shape[0], self.num_outputs))
        cext_acv.dense_tree_predict(
            self.children_left, self.children_right, self.children_default,
            self.features, self.thresholds, self.values,
            self.max_depth, tree_limit, self.base_offset,
            X, X_missing, output)
        # drop dimensions we don't need
        if flat_output:
            if self.num_outputs == 1:
                return output.flatten()[0]
            else:
                return output.reshape(-1, self.num_outputs)
        else:
            if self.num_outputs == 1:
                return output.flatten()
            else:
                return output

    # def single_predict(self, X, y=None, output=None, tree_limit=None, i=0):
    #     """ A consistent interface to make predictions from this model.
    #
    #     Parameters
    #     ----------
    #     tree_limit : None (default) or int
    #         Limit the number of trees used by the model. By default None means use the limit of the
    #         original model, and -1 means no limit.
    #     """
    #     if output is None:
    #         output = self.model_output
    #     if self.model_type == "pyspark":
    #         # import pyspark
    #         # TODO: support predict for pyspark
    #         raise NotImplementedError("Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")
    #     # see if we have a default tree_limit in place.
    #     if tree_limit is None:
    #         tree_limit = -1 if self.tree_limit is None else self.tree_limit
    #     # convert dataframes
    #     if safe_isinstance(X, "pandas.core.series.Series"):
    #         X = X.values
    #     elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
    #         X = X.values
    #     flat_output = False
    #     if len(X.shape) == 1:
    #         flat_output = True
    #         X = X.reshape(1, X.shape[0])
    #     if X.dtype.type != self.input_dtype:
    #         X = X.astype(self.input_dtype)
    #     X_missing = np.isnan(X, dtype=bool)
    #     assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
    #     assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"
    #     if tree_limit < 0 or tree_limit > self.values.shape[0]:
    #         tree_limit = self.values.shape[0]
    #     if output == "logloss":
    #         assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"
    #         assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])
    #     # transform = self.get_transform()
    #     assert_import("cext_acv")
    #     output = np.zeros((X.shape[0], self.num_outputs))
    #     cext_acv.single_tree_predict(
    #         self.children_left, self.children_right, self.children_default,
    #         self.features, self.thresholds, self.values,
    #         self.max_depth, tree_limit, self.base_offset,
    #         X, X_missing, output, i)
    #     # drop dimensions we don't need
    #     if flat_output:
    #         if self.num_outputs == 1:
    #             return output.flatten()[0] / self.scaling
    #         else:
    #             return output.reshape(-1, self.num_outputs) / self.scaling
    #     else:
    #         if self.num_outputs == 1:
    #             return output.flatten() / self.scaling
    #         else:
    #             return output / self.scaling

    # def shap_values(self, x, C=[[]]):
    #     out = np.zeros((x.shape[0], x.shape[1], self.num_outputs))
    #     for i in range(len(self.trees)):
    #         out += shap_values_leaves(x, self.partition_leaves_trees[i], self.data_leaves_trees[i], self.node_idx_trees[i],
    #                                   self.leaf_idx_trees[i], self.node_sample_weight[i], self.values[i], C, self.num_outputs)
    #     return out
    #
    # def shap_values_acv(self, x, C=[[]]):
    #     out = np.zeros((x.shape[0], x.shape[1], self.num_outputs))
    #     for i in range(len(self.trees)):
    #         out += shap_values_leaves(x, self.partition_leaves_trees[i], self.data_leaves_trees[i], self.node_idx_trees[i],
    #                                   self.leaf_idx_trees[i], self.node_sample_weight[i], self.values[i], C, self.num_outputs)
    #     return out

class SingleTree:
    """ A single decision tree.

    The primary point of this object is to parse many different tree types into a common format.
    """
    def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):
        self.scaling = scaling
        if safe_isinstance(tree, ["sklearn.tree._tree.Tree", "econml.tree._tree.Tree"]):
            self.children_left = tree.children_left.astype(np.int32)
            self.children_right = tree.children_right.astype(np.int32)
            self.children_default = self.children_left  # missing values not supported in sklearn
            self.features = tree.feature.astype(np.int32)
            self.thresholds = tree.threshold.astype(np.float64)
            self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
            self.values = self.values * scaling
            self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)
        elif type(tree) is dict and 'features' in tree:
            self.children_left = tree["children_left"].astype(np.int32)
            self.children_right = tree["children_right"].astype(np.int32)
            self.children_default = tree["children_default"].astype(np.int32)
            self.features = tree["features"].astype(np.int32)
            self.thresholds = tree["thresholds"]
            self.values = tree["values"] * scaling
            self.node_sample_weight = tree["node_sample_weight"]
        # deprecated dictionary support (with sklearn singular style "feature" and "value" names)
        elif type(tree) is dict and 'children_left' in tree:
            self.children_left = tree["children_left"].astype(np.int32)
            self.children_right = tree["children_right"].astype(np.int32)
            self.children_default = tree["children_default"].astype(np.int32)
            self.features = tree["feature"].astype(np.int32)
            self.thresholds = tree["threshold"]
            self.values = tree["value"] * scaling
            self.node_sample_weight = tree["node_sample_weight"]
        elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \
                or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):
            # model._java_obj.numNodes() doesn't count leaves, so we need to recompute the size
            def getNumNodes(node, size):
                size = size + 1
                if node.subtreeDepth() == 0:
                    return size
                else:
                    size = getNumNodes(node.leftChild(), size)
                    return getNumNodes(node.rightChild(), size)

            num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)
            self.children_left = np.full(num_nodes, -2, dtype=np.int32)
            self.children_right = np.full(num_nodes, -2, dtype=np.int32)
            self.children_default = np.full(num_nodes, -2, dtype=np.int32)
            self.features = np.full(num_nodes, -2, dtype=np.int32)
            self.thresholds = np.full(num_nodes, -2, dtype=np.float64)
            self.values = [-2] * num_nodes
            self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)

            def buildTree(index, node):
                index = index + 1
                if tree._java_obj.getImpurity() == 'variance':
                    self.values[index] = [node.prediction()]  # prediction for the node
                else:
                    # for gini: ndarray(numLabel), one entry per label: the number of items of that label which went through this node
                    self.values[index] = [e for e in node.impurityStats().stats()]
                self.node_sample_weight[index] = node.impurityStats().count()  # weighted count of elements through this node
                if node.subtreeDepth() == 0:
                    return index
                else:
                    self.features[index] = node.split().featureIndex()  # index of the feature we split on, not available for leaves, int
                    if str(node.split().getClass()).endswith('tree.CategoricalSplit'):
                        # categorical splits aren't implemented, TODO: could fake it by creating a fake node to split on the exact value?
                        raise NotImplementedError('CategoricalSplit are not yet implemented')
                    self.thresholds[index] = node.split().threshold()  # threshold for the feature, not available for leaves, float
                    self.children_left[index] = index + 1
                    idx = buildTree(index, node.leftChild())
                    self.children_right[index] = idx + 1
                    idx = buildTree(idx, node.rightChild())
                    return idx

            buildTree(-1, tree._java_obj.rootNode())
            # default not supported with MLlib? (TODO)
            self.children_default = self.children_left
            self.values = np.asarray(self.values)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
            self.values = self.values * scaling
        elif type(tree) == dict and 'tree_structure' in tree:  # LightGBM model dump
            start = tree['tree_structure']
            num_parents = tree['num_leaves'] - 1
            self.children_left = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_right = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_default = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.features = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.thresholds = np.empty((2 * num_parents + 1), dtype=np.float64)
            self.values = [-2] * (2 * num_parents + 1)
            self.node_sample_weight = np.empty((2 * num_parents + 1), dtype=np.float64)
            visited, queue = [], [start]
            while queue:
                vertex = queue.pop(0)
                if 'split_index' in vertex.keys():
                    if vertex['split_index'] not in visited:
                        if 'split_index' in vertex['left_child'].keys():
                            self.children_left[vertex['split_index']] = vertex['left_child']['split_index']
                        else:
                            self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index'] + num_parents
                        if 'split_index' in vertex['right_child'].keys():
                            self.children_right[vertex['split_index']] = vertex['right_child']['split_index']
                        else:
                            self.children_right[vertex['split_index']] = vertex['right_child']['leaf_index'] + num_parents
                        if vertex['default_left']:
                            self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]
                        else:
                            self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]
                        self.features[vertex['split_index']] = vertex['split_feature']
                        self.thresholds[vertex['split_index']] = vertex['threshold']
                        self.values[vertex['split_index']] = [vertex['internal_value']]
                        self.node_sample_weight[vertex['split_index']] = vertex['internal_count']
                        visited.append(vertex['split_index'])
                        queue.append(vertex['left_child'])
                        queue.append(vertex['right_child'])
                else:
                    self.children_left[vertex['leaf_index'] + num_parents] = -1
                    self.children_right[vertex['leaf_index'] + num_parents] = -1
                    self.children_default[vertex['leaf_index'] + num_parents] = -1
                    self.features[vertex['leaf_index'] + num_parents] = -1
                    self.thresholds[vertex['leaf_index'] + num_parents] = -1
                    self.values[vertex['leaf_index'] + num_parents] = [vertex['leaf_value']]
                    self.node_sample_weight[vertex['leaf_index'] + num_parents] = vertex['leaf_count']
            self.values = np.asarray(self.values)
            self.values = np.multiply(self.values, scaling)
        elif type(tree) == dict and 'nodeid' in tree:
            # directly create the tree given the JSON dump (with stats) of an XGBoost model
            def max_id(node):
                if "children" in node:
                    return max(node["nodeid"], *[max_id(n) for n in node["children"]])
                else:
                    return node["nodeid"]

            m = max_id(tree) + 1
            self.children_left = -np.ones(m, dtype=np.int32)
            self.children_right = -np.ones(m, dtype=np.int32)
            self.children_default = -np.ones(m, dtype=np.int32)
            self.features = -np.ones(m, dtype=np.int32)
            self.thresholds = np.zeros(m, dtype=np.float64)
            self.values = np.zeros((m, 1), dtype=np.float64)
            self.node_sample_weight = np.empty(m, dtype=np.float64)

            def extract_data(node, tree):
                i = node["nodeid"]
                tree.node_sample_weight[i] = node["cover"]
                if "children" in node:
                    tree.children_left[i] = node["yes"]
                    tree.children_right[i] = node["no"]
                    tree.children_default[i] = node["missing"]
                    tree.features[i] = node["split"]
                    tree.thresholds[i] = node["split_condition"]
                    for n in node["children"]:
                        extract_data(n, tree)
                elif "leaf" in node:
                    tree.values[i] = node["leaf"] * scaling

            extract_data(tree, self)
        elif type(tree) == str:
            # build a tree from a text dump (with stats) of XGBoost
            nodes = [t.lstrip() for t in tree[:-1].split("\n")]
            nodes_dict = {}
            for n in nodes:
                nodes_dict[int(n.split(":")[0])] = n.split(":")[1]
            m = max(nodes_dict.keys()) + 1
            children_left = -1 * np.ones(m, dtype="int32")
            children_right = -1 * np.ones(m, dtype="int32")
            children_default = -1 * np.ones(m, dtype="int32")
            features = -2 * np.ones(m, dtype="int32")
            thresholds = -1 * np.ones(m, dtype="float64")
            values = 1 * np.ones(m, dtype="float64")
            node_sample_weight = np.zeros(m, dtype="float64")
            values_lst = list(nodes_dict.values())
            keys_lst = list(nodes_dict.keys())
            for i in range(0, len(keys_lst)):
                value = values_lst[i]
                key = keys_lst[i]
                if "leaf" in value:
                    # extract the leaf value and cover
                    val = float(value.split("leaf=")[1].split(",")[0])
                    node_sample_weight_val = float(value.split("cover=")[1])
                    # append to the arrays
                    values[key] = val
                    node_sample_weight[key] = node_sample_weight_val
                else:
                    c_left = int(value.split("yes=")[1].split(",")[0])
                    c_right = int(value.split("no=")[1].split(",")[0])
                    c_default = int(value.split("missing=")[1].split(",")[0])
                    feat_thres = value.split(" ")[0]
                    if "<" in feat_thres:
                        feature = int(feat_thres.split("<")[0][2:])
                        threshold = float(feat_thres.split("<")[1][:-1])
                    if "=" in feat_thres:
                        feature = int(feat_thres.split("=")[0][2:])
                        threshold = float(feat_thres.split("=")[1][:-1])
                    node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])
                    children_left[key] = c_left
                    children_right[key] = c_right
                    children_default[key] = c_default
                    features[key] = feature
                    thresholds[key] = threshold
                    node_sample_weight[key] = node_sample_weight_val
            self.children_left = children_left
            self.children_right = children_right
            self.children_default = children_default
            self.features = features
            self.thresholds = thresholds
            self.values = values[:, np.newaxis] * scaling
            self.node_sample_weight = node_sample_weight
        else:
            raise Exception("Unknown input to SingleTree constructor: " + str(tree))

        # re-compute the number of samples that pass through each node if we are given data
        # if data is not None and data_missing is not None:
        #     self.node_sample_weight[:] = 0.0
        #     cext_acv.dense_tree_update_weights(
        #         self.children_left, self.children_right, self.children_default, self.features,
        #         self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing
        #     )
        # we compute the expectations to make sure they follow the SHAP logic
        self.max_depth = cext_acv.compute_expectations(
            self.children_left, self.children_right, self.node_sample_weight,
            self.values
        )

    def predict(self, X):
        # see if we have a default tree_limit in place.
        children_left = np.expand_dims(self.children_left, 0)
        children_right = np.expand_dims(self.children_right, 0)
        children_default = np.expand_dims(self.children_default, 0)
        features = np.expand_dims(self.features, 0)
        thresholds = np.expand_dims(self.thresholds, 0)
        values = np.expand_dims(self.values, 0) / self.scaling
        # node_sample_weight = np.expand_dims(self.node_sample_weight, 0)
        # convert dataframes
        if safe_isinstance(X, "pandas.core.series.Series"):
            X = X.values
        elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
            X = X.values
        flat_output = False
        if len(X.shape) == 1:
            flat_output = True
            X = X.reshape(1, X.shape[0])
        # if X.dtype.type != self.input_dtype:
        #     X = X.astype(self.input_dtype)
        X_missing = np.isnan(X, dtype=bool)
        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"
        assert_import("cext_acv")
        tree_limit = 1
        self.num_outputs = self.values.shape[1]
        base_offset = np.zeros(self.num_outputs)
        output = np.zeros((X.shape[0], self.num_outputs))
        cext_acv.dense_tree_predict(
            children_left, children_right, children_default, features, thresholds, values,
            self.max_depth, tree_limit, base_offset, X, X_missing, output)
        # drop dimensions we don't need
        if flat_output:
            if self.num_outputs == 1:
                return output.flatten()[0]
            else:
                return output.reshape(-1, self.num_outputs)
        else:
            if self.num_outputs == 1:
                return output.flatten()
            else:
                return output

class IsoTree(SingleTree):
    """ In sklearn the values stored in the trees of an Isolation Forest are not calculated
    the way SHAP needs them, so they are recalculated here.
    """
    def __init__(self, tree, tree_features, normalize=False, scaling=1.0, data=None, data_missing=None):
        super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)
        if safe_isinstance(tree, "sklearn.tree._tree.Tree"):
            from sklearn.ensemble._iforest import _average_path_length  # pylint: disable=no-name-in-module

            def _recalculate_value(tree, i, level):
                if tree.children_left[i] == -1 and tree.children_right[i] == -1:
                    value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]
                    self.values[i, 0] = value
                    return value * tree.n_node_samples[i]
                else:
                    value_left = _recalculate_value(tree, tree.children_left[i], level + 1)
                    value_right = _recalculate_value(tree, tree.children_right[i], level + 1)
                    self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]
                    return value_left + value_right

            _recalculate_value(tree, 0, 0)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
...
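Both files accept the same plain-dictionary tree format, which makes it easy to sanity-check the parsers without any ML framework installed. Here is a minimal hand-built stump in the plural-key format handled by the `'features' in tree` branch above (the numbers are hypothetical, and running predict additionally assumes the compiled cext_acv extension these classes rely on is available):

import numpy as np

# a depth-1 stump: node 0 splits feature 0 at 0.5; nodes 1 and 2 are leaves (children = -1)
stump = {
    "children_left": np.array([1, -1, -1]),
    "children_right": np.array([2, -1, -1]),
    "children_default": np.array([1, -1, -1]),   # missing values follow the left child
    "features": np.array([0, -2, -2]),           # -2 marks "no split" on leaf nodes
    "thresholds": np.array([0.5, -2.0, -2.0]),
    "values": np.array([[0.0], [-1.0], [1.0]]),  # one output column per node
    "node_sample_weight": np.array([100.0, 60.0, 40.0]),
}

tree = SingleTree(stump)                       # parsed by the dict branch above
print(tree.predict(np.array([[0.2], [0.9]])))  # one leaf value per row, roughly [-1.  1.]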


tree.py

Source:tree.py Github


...
            raise Exception("feature_perturbation = \"independent\" is not a valid option value, please use " \
                            "feature_perturbation = \"interventional\" instead. See GitHub issue #882.")

        if safe_isinstance(data, "pandas.core.frame.DataFrame"):
            self.data = data.values
        elif isinstance(data, DenseData):
            self.data = data.data
        else:
            self.data = data
        if self.data is None:
            feature_perturbation = "tree_path_dependent"
            warnings.warn("Setting feature_perturbation = \"tree_path_dependent\" because no background data was given.")
        elif feature_perturbation == "interventional" and self.data.shape[0] > 1000:
            warnings.warn("Passing " + str(self.data.shape[0]) + " background samples may lead to slow runtimes. Consider "
                          "using shap.sample(data, 100) to create a smaller background data set.")
        self.data_missing = None if self.data is None else np.isnan(self.data)
        self.model_output = model_output
        self.feature_perturbation = feature_perturbation
        self.expected_value = None
        self.model = TreeEnsemble(model, self.data, self.data_missing)

        assert feature_perturbation in feature_perturbation_codes, "Invalid feature_perturbation option!"

        # check for unsupported combinations of feature_perturbation and model_output
        if feature_perturbation == "tree_path_dependent":
            assert model_output == "margin", "Only margin model_output is supported for feature_perturbation=\"tree_path_dependent\""
        else:
            assert data is not None, "A background dataset must be provided unless you are using feature_perturbation=\"tree_path_dependent\"!"

        if model_output != "margin":
            if self.model.objective is None and self.model.tree_output is None:
                raise Exception("Model does not have a known objective or output type! When model_output is " \
                                "not \"margin\" then we need to know the model's objective or link function.")

        # A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs
        if safe_isinstance(model, "xgboost.sklearn.XGBClassifier") and model_output != "margin":
            import xgboost
            assert LooseVersion(xgboost.__version__) >= LooseVersion('0.81'), \
                "A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs! Please upgrade to XGBoost >= v0.81!"

        # compute the expected value if we have a parsed tree for the cext
        if self.model_output == "logloss":
            self.expected_value = self.__dynamic_expected_value
        elif data is not None:
            self.expected_value = self.model.predict(self.data, output=model_output).mean(0)
            if hasattr(self.expected_value, '__len__') and len(self.expected_value) == 1:
                self.expected_value = self.expected_value[0]
        elif hasattr(self.model, "node_sample_weight"):
            self.expected_value = self.model.values[:, 0].sum(0)
            if self.expected_value.size == 1:
                self.expected_value = self.expected_value[0]
            self.expected_value += self.model.base_offset

    def __dynamic_expected_value(self, y):
        """ This computes the expected value conditioned on the given label value.
        """
        return self.model.predict(self.data, np.ones(self.data.shape[0]) * y, output=self.model_output).mean(0)

    def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True):
        """ Estimate the SHAP values for a set of samples.

        Parameters
        ----------
        X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
            A matrix of samples (# samples x # features) on which to explain the model's output.

        y : numpy.array
            An array of label values for each sample. Used when explaining loss functions.

        tree_limit : None (default) or int
            Limit the number of trees used by the model. None (the default) means use the limit
            stored in the original model, and -1 means no limit.

        approximate : bool
            Run fast, but only roughly approximate the Tree SHAP values. This runs a method
            previously proposed by Saabas which only considers a single feature ordering. Take care
            since this does not have the consistency guarantees of Shapley values and places too
            much weight on lower splits in the tree.

        check_additivity : bool
            Run a validation check that the sum of the SHAP values equals the output of the model. This
            check takes only a small amount of time, and will catch potential unforeseen errors.
            Note that this check only runs right now when explaining the margin of the model.

        Returns
        -------
        For models with a single output this returns a matrix of SHAP values
        (# samples x # features). Each row sums to the difference between the model output for that
        sample and the expected value of the model output (which is stored in the expected_value
        attribute of the explainer when it is constant). For models with vector outputs this returns
        a list of such matrices, one for each output.
        """
        # see if we have a default tree_limit in place
        if tree_limit is None:
            tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit

        # shortcut using the C++ version of Tree SHAP in XGBoost, LightGBM, and CatBoost
        if self.feature_perturbation == "tree_path_dependent" and self.model.model_type != "internal" and self.data is None:
            model_output_vals = None
            phi = None
            if self.model.model_type == "xgboost":
                import xgboost
                if not isinstance(X, xgboost.core.DMatrix):
                    X = xgboost.DMatrix(X)
                if tree_limit == -1:
                    tree_limit = 0
                phi = self.model.original_model.predict(
                    X, ntree_limit=tree_limit, pred_contribs=True,
                    approx_contribs=approximate, validate_features=False
                )

                if check_additivity and self.model_output == "margin":
                    model_output_vals = self.model.original_model.predict(
                        X, ntree_limit=tree_limit, output_margin=True,
                        validate_features=False
                    )

            elif self.model.model_type == "lightgbm":
                assert not approximate, "approximate=True is not supported for LightGBM models!"
                phi = self.model.original_model.predict(X, num_iteration=tree_limit, pred_contrib=True)
                # note: the data must be joined on the last axis
                if self.model.original_model.params['objective'] == 'binary':
                    warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')
                    phi = np.concatenate((0 - phi, phi), axis=-1)
                if phi.shape[1] != X.shape[1] + 1:
                    phi = phi.reshape(X.shape[0], phi.shape[1] // (X.shape[1] + 1), X.shape[1] + 1)

            elif self.model.model_type == "catboost":  # thanks to the CatBoost team for implementing this...
                assert not approximate, "approximate=True is not supported for CatBoost models!"
                assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!"
                import catboost
                if type(X) != catboost.Pool:
                    X = catboost.Pool(X)
                phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues')

            # note we pull off the last column and keep it as our expected_value
            if phi is not None:
                if len(phi.shape) == 3:
                    self.expected_value = [phi[0, i, -1] for i in range(phi.shape[1])]
                    out = [phi[:, i, :-1] for i in range(phi.shape[1])]
                else:
                    self.expected_value = phi[0, -1]
                    out = phi[:, :-1]

                if check_additivity and model_output_vals is not None:
                    self.assert_additivity(out, model_output_vals)

                return out

        # convert dataframes
        if safe_isinstance(X, "pandas.core.series.Series"):
            X = X.values
        elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
            X = X.values

        flat_output = False
        if len(X.shape) == 1:
            flat_output = True
            X = X.reshape(1, X.shape[0])
        if X.dtype != self.model.input_dtype:
            X = X.astype(self.model.input_dtype)
        X_missing = np.isnan(X, dtype=np.bool)
        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"

        if tree_limit < 0 or tree_limit > self.model.values.shape[0]:
            tree_limit = self.model.values.shape[0]

        if self.model_output == "logloss":
            assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"
            assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])
        transform = self.model.get_transform(self.model_output)

        if self.feature_perturbation == "tree_path_dependent":
            assert self.model.fully_defined_weighting, "The background dataset you provided does not cover all the leaves in the model, " \
                                                       "so TreeExplainer cannot run with the feature_perturbation=\"tree_path_dependent\" option! " \
                                                       "Try providing a larger background dataset, or using feature_perturbation=\"interventional\"."

        # run the core algorithm using the C extension
        assert_import("cext")
        phi = np.zeros((X.shape[0], X.shape[1] + 1, self.model.n_outputs))
        if not approximate:
            _cext.dense_tree_shap(
                self.model.children_left, self.model.children_right, self.model.children_default,
                self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,
                self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,
                self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],
                output_transform_codes[transform], False
            )
        else:
            _cext.dense_tree_saabas(
                self.model.children_left, self.model.children_right, self.model.children_default,
                self.model.features, self.model.thresholds, self.model.values,
                self.model.max_depth, tree_limit, self.model.base_offset, output_transform_codes[transform],
                X, X_missing, y, phi
            )

        # note we pull off the last column and keep it as our expected_value
        if self.model.n_outputs == 1:
            if self.model_output != "logloss":
                self.expected_value = phi[0, -1, 0]
            if flat_output:
                out = phi[0, :-1, 0]
            else:
                out = phi[:, :-1, 0]
        else:
            if self.model_output != "logloss":
                self.expected_value = [phi[0, -1, i] for i in range(phi.shape[2])]
            if flat_output:
                out = [phi[0, :-1, i] for i in range(self.model.n_outputs)]
            else:
                out = [phi[:, :-1, i] for i in range(self.model.n_outputs)]

        if check_additivity and self.model_output == "margin":
            self.assert_additivity(out, self.model.predict(X))

        return out

    def shap_interaction_values(self, X, y=None, tree_limit=None):
        """ Estimate the SHAP interaction values for a set of samples.

        Parameters
        ----------
        X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
            A matrix of samples (# samples x # features) on which to explain the model's output.

        y : numpy.array
            An array of label values for each sample. Used when explaining loss functions (not yet supported).

        tree_limit : None (default) or int
            Limit the number of trees used by the model. None (the default) means use the limit
            stored in the original model, and -1 means no limit.

        Returns
        -------
        For models with a single output this returns a tensor of SHAP values
        (# samples x # features x # features). The matrix (# features x # features) for each sample sums
        to the difference between the model output for that sample and the expected value of the model output
        (which is stored in the expected_value attribute of the explainer). Each row of this matrix sums to the
        SHAP value for that feature for that sample. The diagonal entries of the matrix represent the
        "main effect" of that feature on the prediction and the symmetric off-diagonal entries represent the
        interaction effects between all pairs of features for that sample. For models with vector outputs
        this returns a list of tensors, one for each output.
        """
        assert self.model_output == "margin", "Only model_output = \"margin\" is supported for SHAP interaction values right now!"
        assert self.feature_perturbation == "tree_path_dependent", "Only feature_perturbation = \"tree_path_dependent\" is supported for SHAP interaction values right now!"
        transform = "identity"

        # see if we have a default tree_limit in place
        if tree_limit is None:
            tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit

        # shortcut using the C++ version of Tree SHAP in XGBoost
        if self.model.model_type == "xgboost":
            import xgboost
            if not isinstance(X, xgboost.core.DMatrix):
                X = xgboost.DMatrix(X)
            if tree_limit == -1:
                tree_limit = 0
            phi = self.model.original_model.predict(X, ntree_limit=tree_limit, pred_interactions=True)

            # note we pull off the last column and keep it as our expected_value
            if len(phi.shape) == 4:
                self.expected_value = [phi[0, i, -1, -1] for i in range(phi.shape[1])]
                return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])]
            else:
                self.expected_value = phi[0, -1, -1]
                return phi[:, :-1, :-1]

        # convert dataframes
        if safe_isinstance(X, "pandas.core.series.Series"):
            X = X.values
        elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
            X = X.values

        flat_output = False
        if len(X.shape) == 1:
            flat_output = True
            X = X.reshape(1, X.shape[0])
        if X.dtype != self.model.input_dtype:
            X = X.astype(self.model.input_dtype)
        X_missing = np.isnan(X, dtype=np.bool)
        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"

        if tree_limit < 0 or tree_limit > self.model.values.shape[0]:
            tree_limit = self.model.values.shape[0]

        # run the core algorithm using the C extension
        assert_import("cext")
        phi = np.zeros((X.shape[0], X.shape[1] + 1, X.shape[1] + 1, self.model.n_outputs))
        _cext.dense_tree_shap(
            self.model.children_left, self.model.children_right, self.model.children_default,
            self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,
            self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,
            self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],
            output_transform_codes[transform], True
        )

        # note we pull off the last column and keep it as our expected_value
        if self.model.n_outputs == 1:
            self.expected_value = phi[0, -1, -1, 0]
            if flat_output:
                out = phi[0, :-1, :-1, 0]
            else:
                out = phi[:, :-1, :-1, 0]
        else:
            self.expected_value = [phi[0, -1, -1, i] for i in range(phi.shape[3])]
            if flat_output:
                out = [phi[0, :-1, :-1, i] for i in range(self.model.n_outputs)]
            else:
                out = [phi[:, :-1, :-1, i] for i in range(self.model.n_outputs)]

        return out

    def assert_additivity(self, phi, model_output):
        err_msg = "Additivity check failed in TreeExplainer! Please report this on GitHub."
        if self.feature_perturbation != "interventional":
            err_msg += " Consider retrying with the feature_perturbation='interventional' option."
        if type(phi) is list:
            for i in range(len(phi)):
                val = self.expected_value[i] + phi[i].sum(-1)
                assert np.max(np.abs(val - model_output[:, i]) / (np.abs(val) + 1e-4)) < 1e-2, err_msg
        else:
            val = self.expected_value + phi.sum(-1)
            assert np.max(np.abs(val - model_output) / (np.abs(val) + 1e-4)) < 1e-2, err_msg


class TreeEnsemble:
    """ An ensemble of decision trees.

    This object provides a common interface to many different types of models.
    """
    def __init__(self, model, data=None, data_missing=None):
        self.model_type = "internal"
        self.trees = None
        less_than_or_equal = True
        self.base_offset = 0
        self.objective = None  # what we explain when explaining the loss of the model
        self.tree_output = None  # what are the units of the values in the leaves of the trees
        self.internal_dtype = np.float64
        self.input_dtype = np.float64  # for sklearn we need to use np.float32 to always get exact matches to their predictions
        self.data = data
        self.data_missing = data_missing
        self.fully_defined_weighting = True  # does the background dataset land in every leaf (making it valid for the tree_path_dependent method)
        self.tree_limit = None  # used for limiting the number of trees we use by default (like from early stopping)

        # we use names like keras
        objective_name_map = {
            "mse": "squared_error",
            "variance": "squared_error",
            "friedman_mse": "squared_error",
            "reg:linear": "squared_error",
            "reg:squarederror": "squared_error",
            "regression": "squared_error",
            "regression_l2": "squared_error",
            "mae": "absolute_error",
            "gini": "binary_crossentropy",
            "entropy": "binary_crossentropy",
            "binary:logistic": "binary_crossentropy",
            "binary_logloss": "binary_crossentropy",
            "binary": "binary_crossentropy"
        }

        tree_output_name_map = {
            "regression": "raw_value",
            "regression_l2": "squared_error",
            "reg:linear": "raw_value",
            "reg:squarederror": "raw_value",
            "binary:logistic": "log_odds",
            "binary_logloss": "log_odds",
            "binary": "log_odds"
        }

        if type(model) is dict and "trees" in model:
            # This allows a dictionary to be passed that represents the model.
            # This dictionary has several numeric parameters and also a list of trees,
            # where each tree is a dictionary describing that tree.
            if "internal_dtype" in model:
                self.internal_dtype = model["internal_dtype"]
            if "input_dtype" in model:
                self.input_dtype = model["input_dtype"]
            if "objective" in model:
                self.objective = model["objective"]
            if "tree_output" in model:
                self.tree_output = model["tree_output"]
            if "base_offset" in model:
                self.base_offset = model["base_offset"]
            self.trees = [Tree(t, data=data, data_missing=data_missing) for t in model["trees"]]
        elif type(model) is list and type(model[0]) == Tree:  # old-style direct-load format
            self.trees = model
        elif safe_isinstance(model, "sklearn.ensemble.forest.RandomForestRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "sklearn.ensemble.iforest.IsolationForest"):
            self.dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [IsoTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "sklearn.ensemble.forest.ExtraTreesRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "sklearn.tree.tree.DecisionTreeRegressor"):
            self.internal_dtype = model.tree_.value.dtype.type
            self.input_dtype = np.float32
            self.trees = [Tree(model.tree_, data=data, data_missing=data_missing)]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "sklearn.tree.tree.DecisionTreeClassifier"):
            self.internal_dtype = model.tree_.value.dtype.type
            self.input_dtype = np.float32
            self.trees = [Tree(model.tree_, normalize=True, data=data, data_missing=data_missing)]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        elif safe_isinstance(model, "sklearn.ensemble.forest.RandomForestClassifier"):
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        elif safe_isinstance(model, "sklearn.ensemble.forest.ExtraTreesClassifier"):  # TODO: add unit test for this case
            assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"
            self.internal_dtype = model.estimators_[0].tree_.value.dtype.type
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        elif safe_isinstance(model, "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"):
            self.input_dtype = np.float32

            # currently we only support the mean and quantile estimators
            if safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.MeanEstimator"):
                self.base_offset = model.init_.mean
            elif safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.QuantileEstimator"):
                self.base_offset = model.init_.quantile
            elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):
                self.base_offset = model.init_.constant_[0]
            else:
                assert False, "Unsupported init model type: " + str(type(model.init_))

            self.trees = [Tree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:, 0]]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "raw_value"
        elif safe_isinstance(model, "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"):
            self.input_dtype = np.float32

            # TODO: deal with estimators for each class
            if model.estimators_.shape[1] > 1:
                assert False, "GradientBoostingClassifier is only supported for binary classification right now!"

            # currently we only support the log odds estimator
            if safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.LogOddsEstimator"):
                self.base_offset = model.init_.prior
                self.tree_output = "log_odds"
            elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):
                self.base_offset = scipy.special.logit(model.init_.class_prior_[1])  # with two classes the trees only model the second class
                self.tree_output = "log_odds"
            else:
                assert False, "Unsupported init model type: " + str(type(model.init_))

            self.trees = [Tree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:, 0]]
            self.objective = objective_name_map.get(model.criterion, None)
        elif "pyspark.ml" in str(type(model)):
            assert_import("pyspark")
            self.original_model = model
            self.model_type = "pyspark"
            # model._java_obj.getImpurity() can be gini, entropy or variance
            self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)
            if "Classification" in str(type(model)):
                normalize = True
                self.tree_output = "probability"
            else:
                normalize = False
                self.tree_output = "raw_value"
            # Spark random forest: create 1 weighted (avg) tree per sub-model
            if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):
                sum_weight = sum(model.treeWeights)  # output is average of trees
                self.trees = [Tree(tree, normalize=normalize, scaling=model.treeWeights[i] / sum_weight) for i, tree in enumerate(model.trees)]
            # Spark GBT: create 1 weighted (learning rate) tree per sub-model
            elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):
                self.objective = "squared_error"  # GBT subtrees use the variance
                self.tree_output = "raw_value"
                self.trees = [Tree(tree, normalize=False, scaling=model.treeWeights[i]) for i, tree in enumerate(model.trees)]
            # Spark basic model (single tree)
            elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \
                    or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):
                self.trees = [Tree(model, normalize=normalize, scaling=1)]
            else:
                assert False, "Unsupported Spark model type: " + str(type(model))
        elif safe_isinstance(model, "xgboost.core.Booster"):
            import xgboost
            self.original_model = model
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            less_than_or_equal = False
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
        elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):
            import xgboost
            self.input_dtype = np.float32
            self.model_type = "xgboost"
            self.original_model = model.get_booster()
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            less_than_or_equal = False
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
            self.tree_limit = getattr(model, "best_ntree_limit", None)
        elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):
            import xgboost
            self.original_model = model.get_booster()
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            less_than_or_equal = False
            self.objective = objective_name_map.get(xgb_loader.name_obj, None)
            self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)
            self.tree_limit = getattr(model, "best_ntree_limit", None)
        elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):
            import xgboost
            self.original_model = model.get_booster()
            self.model_type = "xgboost"
            xgb_loader = XGBTreeModelLoader(self.original_model)
            self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
            self.base_offset = xgb_loader.base_score
            less_than_or_equal = False
            # Note: for the ranker we leave tree_output and objective as None since they
            # are not implemented in native code yet
            self.tree_limit = getattr(model, "best_ntree_limit", None)
        elif safe_isinstance(model, "lightgbm.basic.Booster"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]
            except:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)
            self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]
            except:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.objective, None)
            self.tree_output = tree_output_name_map.get(model.objective, None)
            if model.objective is None:
                self.objective = "squared_error"
                self.tree_output = "raw_value"
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]
            except:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            # Note: for the ranker we leave tree_output and objective as None since they
            # are not implemented in native code yet
        elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"):
            assert_import("lightgbm")
            self.model_type = "lightgbm"
            self.original_model = model.booster_
            tree_info = self.original_model.dump_model()["tree_info"]
            try:
                self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]
            except:
                self.trees = None  # we get here because the cext can't handle categorical splits yet
            self.objective = objective_name_map.get(model.objective, None)
            self.tree_output = tree_output_name_map.get(model.objective, None)
            if model.objective is None:
                self.objective = "binary_crossentropy"
                self.tree_output = "log_odds"
        elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
        elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
            self.input_dtype = np.float32
            cb_loader = CatBoostTreeModelLoader(model)
            self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)
            self.tree_output = "log_odds"
            self.objective = "binary_crossentropy"
        elif safe_isinstance(model, "catboost.core.CatBoost"):
            assert_import("catboost")
            self.model_type = "catboost"
            self.original_model = model
        elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):
            self.input_dtype = np.float32
            scaling = 1.0 / len(model.estimators_)  # output is average of trees
            self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]
            self.objective = objective_name_map.get(model.criterion, None)
            self.tree_output = "probability"
        else:
            raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))

        # build a dense numpy version of all the tree objects
        if self.trees is not None and self.trees:
            max_nodes = np.max([len(t.values) for t in self.trees])
            assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, "All trees in the ensemble must have the same output dimension!"
            ntrees = len(self.trees)
            self.n_outputs = self.trees[0].values.shape[1]

            # important to be -1 in unused sections!! This way we can tell which entries are valid.
            self.children_left = -np.ones((ntrees, max_nodes), dtype=np.int32)
            self.children_right = -np.ones((ntrees, max_nodes), dtype=np.int32)
            self.children_default = -np.ones((ntrees, max_nodes), dtype=np.int32)
            self.features = -np.ones((ntrees, max_nodes), dtype=np.int32)
            self.thresholds = np.zeros((ntrees, max_nodes), dtype=self.internal_dtype)
            self.values = np.zeros((ntrees, max_nodes, self.trees[0].values.shape[1]), dtype=self.internal_dtype)
            self.node_sample_weight = np.zeros((ntrees, max_nodes), dtype=self.internal_dtype)

            for i in range(ntrees):
                l = len(self.trees[i].features)
                self.children_left[i, :l] = self.trees[i].children_left
                self.children_right[i, :l] = self.trees[i].children_right
                self.children_default[i, :l] = self.trees[i].children_default
                self.features[i, :l] = self.trees[i].features
                self.thresholds[i, :l] = self.trees[i].thresholds
                self.values[i, :l, :] = self.trees[i].values
                self.node_sample_weight[i, :l] = self.trees[i].node_sample_weight

                # ensure that the passed background dataset lands in every leaf
                if np.min(self.trees[i].node_sample_weight) <= 0:
                    self.fully_defined_weighting = False

            # If we should do <= then we nudge the thresholds to make our <= work like <
            if not less_than_or_equal:
                self.thresholds = np.nextafter(self.thresholds, -np.inf)

            self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)
            self.max_depth = np.max([t.max_depth for t in self.trees])

    def get_transform(self, model_output):
        """ A consistent interface to make predictions from this model.
        """
        if model_output == "margin":
            transform = "identity"
        elif model_output == "probability":
            if self.tree_output == "log_odds":
                transform = "logistic"
            elif self.tree_output == "probability":
                transform = "identity"
            else:
                raise Exception("model_output = \"probability\" is not yet supported when model.tree_output = \"" + self.tree_output + "\"!")
        elif model_output == "logloss":
            if self.objective == "squared_error":
                transform = "squared_loss"
            elif self.objective == "binary_crossentropy":
                transform = "logistic_nlogloss"
            else:
                raise Exception("model_output = \"logloss\" is not yet supported when model.objective = \"" + self.objective + "\"!")
        else:
            assert False, "Unrecognized model_output parameter value: " + model_output

        return transform

    def predict(self, X, y=None, output="margin", tree_limit=None):
        """ A consistent interface to make predictions from this model.

        Parameters
        ----------
        tree_limit : None (default) or int
            Limit the number of trees used by the model. None (the default) means use the limit
            stored in the original model, and -1 means no limit.
        """
        if self.model_type == "pyspark":
            import pyspark
            # TODO: support predict for pyspark
            raise NotImplementedError("Predict with pyspark isn't implemented")

        # see if we have a default tree_limit in place
        if tree_limit is None:
            tree_limit = -1 if self.tree_limit is None else self.tree_limit

        # convert dataframes
        if safe_isinstance(X, "pandas.core.series.Series"):
            X = X.values
        elif safe_isinstance(X, "pandas.core.frame.DataFrame"):
            X = X.values

        flat_output = False
        if len(X.shape) == 1:
            flat_output = True
            X = X.reshape(1, X.shape[0])
        if X.dtype.type != self.input_dtype:
            X = X.astype(self.input_dtype)
        X_missing = np.isnan(X, dtype=np.bool)
        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"

        if tree_limit < 0 or tree_limit > self.values.shape[0]:
            tree_limit = self.values.shape[0]

        if output == "logloss":
            assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"
            assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])
        transform = self.get_transform(output)

        if True or self.model_type == "internal":
            # note: `output` is reused here as the prediction buffer passed to the C extension
            output = np.zeros((X.shape[0], self.n_outputs))
            assert_import("cext")
            _cext.dense_tree_predict(
                self.children_left, self.children_right, self.children_default,
                self.features, self.thresholds, self.values,
                self.max_depth, tree_limit, self.base_offset, output_transform_codes[transform],
                X, X_missing, y, output
            )
        elif self.model_type == "xgboost":
            import xgboost
            output = self.original_model.predict(X, output_margin=True, tree_limit=tree_limit)

        # drop dimensions we don't need
        if flat_output:
            if self.n_outputs == 1:
                return output.flatten()[0]
            else:
                return output.reshape(-1, self.n_outputs)
        else:
            if self.n_outputs == 1:
                return output.flatten()
            else:
                return output


class Tree:
    """ A single decision tree.

    The primary point of this object is to parse many different tree types into a common format.
    """
    def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):
        assert_import("cext")

        if safe_isinstance(tree, "sklearn.tree._tree.Tree"):
            self.children_left = tree.children_left.astype(np.int32)
            self.children_right = tree.children_right.astype(np.int32)
            self.children_default = self.children_left  # missing values not supported in sklearn
            self.features = tree.feature.astype(np.int32)
            self.thresholds = tree.threshold.astype(np.float64)
            self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
            self.values = self.values * scaling
            self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)
        elif type(tree) is dict and 'features' in tree:
            self.children_left = tree["children_left"].astype(np.int32)
            self.children_right = tree["children_right"].astype(np.int32)
            self.children_default = tree["children_default"].astype(np.int32)
            self.features = tree["features"].astype(np.int32)
            self.thresholds = tree["thresholds"]
            self.values = tree["values"] * scaling
            self.node_sample_weight = tree["node_sample_weight"]
        # deprecated dictionary support (with sklearn singular style "feature" and "value" names)
        elif type(tree) is dict and 'children_left' in tree:
            self.children_left = tree["children_left"].astype(np.int32)
            self.children_right = tree["children_right"].astype(np.int32)
            self.children_default = tree["children_default"].astype(np.int32)
            self.features = tree["feature"].astype(np.int32)
            self.thresholds = tree["threshold"]
            self.values = tree["value"] * scaling
            self.node_sample_weight = tree["node_sample_weight"]
        elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \
                or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):
            # model._java_obj.numNodes() doesn't give leaves, need to recompute the size
            def getNumNodes(node, size):
                size = size + 1
                if node.subtreeDepth() == 0:
                    return size
                else:
                    size = getNumNodes(node.leftChild(), size)
                    return getNumNodes(node.rightChild(), size)

            num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)
            self.children_left = np.full(num_nodes, -2, dtype=np.int32)
            self.children_right = np.full(num_nodes, -2, dtype=np.int32)
            self.children_default = np.full(num_nodes, -2, dtype=np.int32)
            self.features = np.full(num_nodes, -2, dtype=np.int32)
            self.thresholds = np.full(num_nodes, -2, dtype=np.float64)
            self.values = [-2] * num_nodes
            self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)

            def buildTree(index, node):
                index = index + 1
                if tree._java_obj.getImpurity() == 'variance':
                    self.values[index] = [node.prediction()]  # prediction for the node
                else:
                    self.values[index] = [e for e in node.impurityStats().stats()]  # for gini: NDarray(numLabel): 1 per label: number of items for each label that went through this node
                self.node_sample_weight[index] = node.impurityStats().count()  # weighted count of elements through this node

                if node.subtreeDepth() == 0:
                    return index
                else:
                    self.features[index] = node.split().featureIndex()  # index of the feature we split on, not available for leaf, int
                    if str(node.split().getClass()).endswith('tree.CategoricalSplit'):
                        # Categorical split isn't implemented, TODO: could fake it by creating a fake node to split on the exact value?
                        raise NotImplementedError('CategoricalSplit are not yet implemented')
                    self.thresholds[index] = node.split().threshold()  # threshold for the feature, not available for leaf, float

                    self.children_left[index] = index + 1
                    idx = buildTree(index, node.leftChild())
                    self.children_right[index] = idx + 1
                    idx = buildTree(idx, node.rightChild())
                    return idx

            buildTree(-1, tree._java_obj.rootNode())
            # default not supported with mllib? (TODO)
            self.children_default = self.children_left
            self.values = np.asarray(self.values)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
            self.values = self.values * scaling
        elif type(tree) == dict and 'tree_structure' in tree:
            start = tree['tree_structure']
            num_parents = tree['num_leaves'] - 1
            self.children_left = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_right = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.children_default = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.features = np.empty((2 * num_parents + 1), dtype=np.int32)
            self.thresholds = np.empty((2 * num_parents + 1), dtype=np.float64)
            self.values = [-2] * (2 * num_parents + 1)
            self.node_sample_weight = np.empty((2 * num_parents + 1), dtype=np.float64)
            visited, queue = [], [start]
            while queue:
                vertex = queue.pop(0)
                if 'split_index' in vertex.keys():
                    if vertex['split_index'] not in visited:
                        if 'split_index' in vertex['left_child'].keys():
                            self.children_left[vertex['split_index']] = vertex['left_child']['split_index']
                        else:
                            self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index'] + num_parents
                        if 'split_index' in vertex['right_child'].keys():
                            self.children_right[vertex['split_index']] = vertex['right_child']['split_index']
                        else:
                            self.children_right[vertex['split_index']] = vertex['right_child']['leaf_index'] + num_parents
                        if vertex['default_left']:
                            self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]
                        else:
                            self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]
                        self.features[vertex['split_index']] = vertex['split_feature']
                        self.thresholds[vertex['split_index']] = vertex['threshold']
                        self.values[vertex['split_index']] = [vertex['internal_value']]
                        self.node_sample_weight[vertex['split_index']] = vertex['internal_count']
                        visited.append(vertex['split_index'])
                        queue.append(vertex['left_child'])
                        queue.append(vertex['right_child'])
                else:
                    self.children_left[vertex['leaf_index'] + num_parents] = -1
                    self.children_right[vertex['leaf_index'] + num_parents] = -1
                    self.children_default[vertex['leaf_index'] + num_parents] = -1
                    self.features[vertex['leaf_index'] + num_parents] = -1
                    self.thresholds[vertex['leaf_index'] + num_parents] = -1
                    self.values[vertex['leaf_index'] + num_parents] = [vertex['leaf_value']]
                    self.node_sample_weight[vertex['leaf_index'] + num_parents] = vertex['leaf_count']
            self.values = np.asarray(self.values)
            self.values = np.multiply(self.values, scaling)
        elif type(tree) == dict and 'nodeid' in tree:
            # Directly create tree given the JSON dump (with stats) of an XGBoost model.
            def max_id(node):
                if "children" in node:
                    return max(node["nodeid"], *[max_id(n) for n in node["children"]])
                else:
                    return node["nodeid"]

            m = max_id(tree) + 1
            self.children_left = -np.ones(m, dtype=np.int32)
            self.children_right = -np.ones(m, dtype=np.int32)
            self.children_default = -np.ones(m, dtype=np.int32)
            self.features = -np.ones(m, dtype=np.int32)
            self.thresholds = np.zeros(m, dtype=np.float64)
            self.values = np.zeros((m, 1), dtype=np.float64)
            self.node_sample_weight = np.empty(m, dtype=np.float64)

            def extract_data(node, tree):
                i = node["nodeid"]
                tree.node_sample_weight[i] = node["cover"]
                if "children" in node:
                    tree.children_left[i] = node["yes"]
                    tree.children_right[i] = node["no"]
                    tree.children_default[i] = node["missing"]
                    tree.features[i] = node["split"]
                    tree.thresholds[i] = node["split_condition"]
                    for n in node["children"]:
                        extract_data(n, tree)
                elif "leaf" in node:
                    tree.values[i] = node["leaf"] * scaling

            extract_data(tree, self)
        elif type(tree) == str:
            # Build a tree from a text dump (with stats) of xgboost.
            nodes = [t.lstrip() for t in tree[:-1].split("\n")]
            nodes_dict = {}
            for n in nodes:
                nodes_dict[int(n.split(":")[0])] = n.split(":")[1]
            m = max(nodes_dict.keys()) + 1
            children_left = -1 * np.ones(m, dtype="int32")
            children_right = -1 * np.ones(m, dtype="int32")
            children_default = -1 * np.ones(m, dtype="int32")
            features = -2 * np.ones(m, dtype="int32")
            thresholds = -1 * np.ones(m, dtype="float64")
            values = 1 * np.ones(m, dtype="float64")
            node_sample_weight = np.zeros(m, dtype="float64")
            values_lst = list(nodes_dict.values())
            keys_lst = list(nodes_dict.keys())
            for i in range(0, len(keys_lst)):
                value = values_lst[i]
                key = keys_lst[i]
                if "leaf" in value:
                    # extract leaf value and cover
                    val = float(value.split("leaf=")[1].split(",")[0])
                    node_sample_weight_val = float(value.split("cover=")[1])
                    values[key] = val
                    node_sample_weight[key] = node_sample_weight_val
                else:
                    c_left = int(value.split("yes=")[1].split(",")[0])
                    c_right = int(value.split("no=")[1].split(",")[0])
                    c_default = int(value.split("missing=")[1].split(",")[0])
                    feat_thres = value.split(" ")[0]
                    if "<" in feat_thres:
                        feature = int(feat_thres.split("<")[0][2:])
                        threshold = float(feat_thres.split("<")[1][:-1])
                    if "=" in feat_thres:
                        feature = int(feat_thres.split("=")[0][2:])
                        threshold = float(feat_thres.split("=")[1][:-1])
                    node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])
                    children_left[key] = c_left
                    children_right[key] = c_right
                    children_default[key] = c_default
                    features[key] = feature
                    thresholds[key] = threshold
                    node_sample_weight[key] = node_sample_weight_val

            self.children_left = children_left
            self.children_right = children_right
            self.children_default = children_default
            self.features = features
            self.thresholds = thresholds
            self.values = values[:, np.newaxis] * scaling
            self.node_sample_weight = node_sample_weight
        else:
            raise Exception("Unknown input to Tree constructor!")

        # Re-compute the number of samples that pass through each node if we are given data
        if data is not None and data_missing is not None:
            self.node_sample_weight[:] = 0.0
            _cext.dense_tree_update_weights(
                self.children_left, self.children_right, self.children_default, self.features,
                self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing
            )

        # we compute the expectations to make sure they follow the SHAP logic
        self.max_depth = _cext.compute_expectations(
            self.children_left, self.children_right, self.node_sample_weight,
            self.values
        )


class IsoTree(Tree):
    """ A single tree from an Isolation Forest.

    In sklearn the values of an Isolation Forest's trees are not calculated in the form we
    need, so we recompute them here as average path lengths.
    """
    def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):
        super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)
        if safe_isinstance(tree, "sklearn.tree._tree.Tree"):
            from sklearn.ensemble.iforest import _average_path_length

            def _recalculate_value(tree, i, level):
                if tree.children_left[i] == -1 and tree.children_right[i] == -1:
                    value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]
                    self.values[i, 0] = value
                    return value * tree.n_node_samples[i]
                else:
                    value_left = _recalculate_value(tree, tree.children_left[i], level + 1)
                    value_right = _recalculate_value(tree, tree.children_right[i], level + 1)
                    self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]
                    return value_left + value_right

            _recalculate_value(tree, 0, 0)
            if normalize:
                self.values = (self.values.T / self.values.sum(1)).T
...


utils.py

Source:utils.py Github


...
def is_tree_model(model):
    if type(model) is dict and "trees" in model or \
            safe_isinstance(model, ["sklearn.ensemble.RandomForestRegressor", "sklearn.ensemble.forest.RandomForestRegressor"]) \
            or safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble.iforest.IsolationForest"]) \
            or safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor") \
            or safe_isinstance(model, ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]) \
            or safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor") \
            or safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]) \
            or safe_isinstance(model, ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]) \
            or safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier", "sklearn.ensemble.forest.RandomForestClassifier"]) \
            or safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier", "sklearn.ensemble.forest.ExtraTreesClassifier"]) \
            or safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor", "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]) \
            or safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier", "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]) \
            or safe_isinstance(model, "xgboost.core.Booster") \
            or safe_isinstance(model, "xgboost.sklearn.XGBClassifier") \
            or safe_isinstance(model, "xgboost.sklearn.XGBRegressor") \
            or safe_isinstance(model, "xgboost.sklearn.XGBRanker") \
            or safe_isinstance(model, "lightgbm.basic.Booster") \
            or safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor") \
            or safe_isinstance(model, "lightgbm.sklearn.LGBMRanker") \
            or safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier") \
            or safe_isinstance(model, "catboost.core.CatBoostRegressor") \
            or safe_isinstance(model, "catboost.core.CatBoostClassifier") \
            or safe_isinstance(model, "catboost.core.CatBoost") \
            or safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):
        return True
    else:
        return False


def check_empty(d, errstr='the input is empty'):
    if d is None:
        raise ValueError(errstr)


# binning function
def bin_me(act, pred, n_bins):
    "bin values in arrays act and pred into (n_bins+1) bins and return aggregated values in a data frame"
    n = act.size
...
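
Since this page is about safe_isinstance, it helps to see what the helper has to do: compare an object against a fully qualified class path (or a list of paths) given as a string, without forcing the optional dependency to be imported. The sketch below is an assumption about how such a helper can be written, based on the call sites above; it is not a verbatim copy of the library's implementation.

import sys

def safe_isinstance(obj, class_path_str):
    """Check isinstance against dotted class paths without importing them.

    Illustrative sketch only: accepts "pkg.module.ClassName" or a list of
    such strings, and only matches modules that are already imported.
    """
    if isinstance(class_path_str, str):
        class_path_str = [class_path_str]
    for class_path in class_path_str:
        module_name, _, class_name = class_path.rpartition(".")
        module = sys.modules.get(module_name)  # no import is attempted
        if module is not None:
            _class = getattr(module, class_name, None)
            if _class is not None and isinstance(obj, _class):
                return True
    return False

Checking sys.modules instead of importing means a model type whose package was never imported simply never matches, which is why is_tree_model above can probe for xgboost, lightgbm, catboost, and imblearn classes even when those packages are not installed.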

