Best Python code snippet using molecule_python
ml.py
Source:ml.py  
import hashlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (auc, confusion_matrix, precision_score,
                             recall_score, roc_auc_score, roc_curve)
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import categorical_encoder as cat_encode

MODELS_TO_RUN = ['RF', 'DT', 'LR', 'SVM', 'KNN']
FEATURES = ['platform', 'visitorType', 'CategoryID']
PATH_RESULTS = "./results/"
PATH_IMAGES = "./images/"


# Simple undersampling of the majority class to ensure a balanced data set
# for training/validation
def under_sampling(df: pd.DataFrame, response_col: str):
    df_one = df.loc[df[response_col] == 1]
    df_zero = df.loc[df[response_col] == 0]
    if len(df_one) < len(df_zero):
        df_zero = df_zero.sample(n=len(df_one))
    else:
        df_one = df_one.sample(n=len(df_zero))
    return pd.concat([df_zero, df_one])


def prepare_input(df: pd.DataFrame):
    """
    Prepare key inputs for the later steps of the pipeline.
    Input:
    - df: pandas dataframe
    Output:
    - transformed train data, transformed test data, original train data,
      original test data, categorical pipeline
    """
    df['NewID'] = df.index
    train_set, test_set = split_train_test_by_id(df, 0.3, 'NewID')
    cat_attribs = [FEATURES[0], FEATURES[1], FEATURES[2]]
    cat_pipeline = Pipeline([
        ('selector', cat_encode.DataFrameSelector(cat_attribs)),
        ('cat_encoder', cat_encode.CategoricalEncoder(encoding="onehot-dense")),
    ])
    train_set_num = train_set[FEATURES]
    train_prepared = cat_pipeline.fit_transform(train_set_num)
    test_set_num = test_set[FEATURES]
    test_prepared = cat_pipeline.transform(test_set_num)
    return train_prepared, test_prepared, train_set, test_set, cat_pipeline
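# The custom `categorical_encoder` module imported above is not part of this
# listing. The sketch below is a hypothetical stand-in showing the interface the
# pipeline appears to rely on (column selection plus dense one-hot encoding);
# it is an assumption built on scikit-learn's OneHotEncoder, not the original
# module. `sparse_output` requires scikit-learn >= 1.2 (older versions use
# `sparse=`).
#
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.preprocessing import OneHotEncoder
#
# class DataFrameSelector(BaseEstimator, TransformerMixin):
#     """Select the configured DataFrame columns and return their values."""
#     def __init__(self, attribute_names):
#         self.attribute_names = attribute_names
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return X[self.attribute_names].values
#
# class CategoricalEncoder(BaseEstimator, TransformerMixin):
#     """One-hot encode categoricals; 'onehot-dense' yields a dense array."""
#     def __init__(self, encoding="onehot-dense"):
#         self.encoding = encoding
#     def fit(self, X, y=None):
#         self.encoder_ = OneHotEncoder(sparse_output=(self.encoding != "onehot-dense"))
#         self.encoder_.fit(X)
#         return self
#     def transform(self, X):
#         return self.encoder_.transform(X)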
def find_best_model(df: pd.DataFrame, grid_size: str, outcome_var: str, file_name=None):
    """
    Use grid search to find the best model.
    Input:
    - df: pandas dataframe
    - grid_size: one of 3 possible values: 'test', 'small', 'large'
    - outcome_var: the outcome variable
    - file_name: file name of the csv file containing the results
    Output:
    - either returns a dataframe or saves the results as a csv file
    """
    clfs, grid = define_clfs_params(grid_size)
    # define models to run
    models_to_run = MODELS_TO_RUN
    # call clf_loop and store results in results_df
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    results_df = clf_loop(models_to_run, clfs, grid, train_prepared, test_prepared,
                          train_set[outcome_var], test_set[outcome_var])
    # save to csv
    if file_name:
        file_name = PATH_RESULTS + file_name
        results_df.to_csv(file_name, index=False)
    else:
        return results_df


# Calculate the AUC score of a 1-feature decision tree as the baseline result
def baseline_model(df: pd.DataFrame, outcome_var: str):
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    dec_tree = DecisionTreeClassifier(max_depth=1, min_samples_split=10)
    y_pred_probs = dec_tree.fit(train_prepared, train_set[outcome_var]).predict_proba(test_prepared)[:, 1]
    print("AUC score of 1-feature decision tree: " + str(roc_auc_score(test_set[outcome_var], y_pred_probs)))


# Fit the best model and plot the ROC graph
def fit_random_forest(df: pd.DataFrame, outcome_var: str):
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    model = RandomForestClassifier(max_depth=5, max_features='sqrt', min_samples_split=2,
                                   n_estimators=10, n_jobs=-1)
    model.fit(train_prepared, train_set[outcome_var])
    model_preds = model.predict_proba(test_prepared)
    prob_true = model_preds[::, 1]
    plot_roc("RandomForest", prob_true, test_set[outcome_var], "save")


# The code from here through clf_loop is adapted from Rayid Ghani's magicloops:
# https://github.com/rayidghani/magicloops
# Plot the ROC curve
def plot_roc(name, probs, true, output_type):
    fpr, tpr, thresholds = roc_curve(true, probs)
    roc_auc = auc(fpr, tpr)
    plt.clf()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(name)
    plt.legend(loc="lower right")
    if output_type == 'save':
        plt.savefig(PATH_IMAGES + name + '_roc.png')
    else:
        plt.show()


# Generate binary predictions at a cutoff defined as the top k percent of the
# sample; only applies to scores already sorted in descending order
def generate_binary_at_k(y_scores, k):
    cutoff_index = int(len(y_scores) * (k / 100.0))
    predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
    return predictions_binary


def joint_sort_descending(l1, l2):
    # l1 and l2 have to be numpy arrays
    idx = np.argsort(l1)[::-1]
    return l1[idx], l2[idx]


# Calculate precision at k
def precision_at_k(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    precision = precision_score(y_true_sorted, preds_at_k)
    return precision


# Calculate recall at k
def recall_at_k(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    recall = recall_score(y_true_sorted, preds_at_k)
    return recall


# Create a confusion matrix at k
def create_confusion_matrix(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    table = confusion_matrix(y_true_sorted, preds_at_k)
    return table
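# Worked example (illustrative, not part of the original file): with four
# observations and k = 50, the top half of the score-sorted list is labelled 1.
#     y_true, y_scores = [0, 1, 0, 1], [0.9, 0.8, 0.3, 0.2]
#     cutoff_index = int(4 * 0.5) = 2, so preds_at_k = [1, 1, 0, 0]
#     precision_at_k(y_true, y_scores, 50.0)  # -> 0.5 (one of the top two is a 1)
#     recall_at_k(y_true, y_scores, 50.0)     # -> 0.5 (one of the two 1s is captured)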
# Plot precision and recall against the percent of the population scored
def plot_precision_recall_n(y_true, y_prob, model_name, output_type):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score >= value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax1.set_ylim([0, 1])
    ax2.set_ylim([0, 1])
    ax1.set_xlim([0, 1])
    name = model_name
    plt.title(name)
    if output_type == 'save':
        plt.savefig(name)
    else:
        plt.show()


def define_clfs_params(grid_size):
    """Define defaults for the different classifiers.
    Define three types of grids:
    Test: for testing your code
    Small: small grid
    Large: larger grid with many more parameter sweeps
    """
    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
            'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
            'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss="hinge", penalty="l2"),
            'KNN': KNeighborsClassifier(n_neighbors=3)
            }
    large_grid = {
        'RF': {'n_estimators': [1, 10, 100, 1000, 10000], 'max_depth': [1, 5, 10, 20, 50, 100],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'solver': ['liblinear']},
        'SGD': {'loss': ['hinge', 'log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
        'ET': {'n_estimators': [1, 10, 100, 1000, 10000], 'criterion': ['gini', 'entropy'],
               'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10],
               'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
        'GB': {'n_estimators': [1, 10, 100, 1000, 10000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'min_samples_split': [2, 5, 10]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
    small_grid = {
        'RF': {'n_estimators': [10, 100], 'max_depth': [5, 50], 'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.001, 0.1, 1, 10], 'solver': ['liblinear']},
        'SGD': {'loss': ['log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
        'ET': {'n_estimators': [10, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [5, 50],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 10], 'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
        'GB': {'n_estimators': [10, 100], 'learning_rate': [0.001, 0.1, 0.5], 'subsample': [0.1, 0.5, 1.0],
               'max_depth': [5, 50]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'min_samples_split': [2, 5, 10]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
    test_grid = {
        'RF': {'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
        'LR': {'penalty': ['l1'], 'C': [0.01], 'solver': ['liblinear']},
        'SGD': {'loss': ['perceptron'], 'penalty': ['l2']},
        'ET': {'n_estimators': [1], 'criterion': ['gini'], 'max_depth': [1], 'max_features': ['sqrt'],
               'min_samples_split': [10]},
        'AB': {'algorithm': ['SAMME'], 'n_estimators': [1]},
        'GB': {'n_estimators': [1], 'learning_rate': [0.1], 'subsample': [0.5], 'max_depth': [1]},
        'NB': {},
        'DT': {'criterion': ['gini'], 'max_depth': [1], 'min_samples_split': [10]},
        'SVM': {'C': [1], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [5], 'weights': ['uniform'], 'algorithm': ['auto']}
    }
    if grid_size == 'large':
        return clfs, large_grid
    elif grid_size == 'small':
        return clfs, small_grid
    elif grid_size == 'test':
        return clfs, test_grid
    else:
        return 0, 0
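# For a sense of scale (illustrative, not part of the original file): the small
# 'RF' grid above expands to 2 * 2 * 2 * 2 = 16 parameter combinations.
#     clfs, grid = define_clfs_params('small')
#     len(list(ParameterGrid(grid['RF'])))  # -> 16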
def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    """Run the loop over models_to_run, clfs, grid, and the data.
    """
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'auc-roc', 'r_at_5', 'r_at_10', 'r_at_20',
                                       'r_at_30', 'r_at_35', 'r_at_40', 'p_at_5', 'p_at_10', 'p_at_20', 'p_at_30',
                                       'p_at_35', 'p_at_40'))
    for n in range(1, 2):  # single pass; placeholder for looping over multiple splits
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
                    # you could also store the model, feature importances, and prediction scores;
                    # only the metrics are stored for now
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [models_to_run[index], clf, p,
                                                       roc_auc_score(y_test, y_pred_probs),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 35.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 40.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 35.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 40.0)]
                except IndexError as e:
                    print('Error:', e)
                    continue
    return results_df
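# Example of inspecting the output (illustrative, not part of the original
# file): find_best_model returns the results frame when no file_name is given,
# so the grid-search results can be ranked directly, e.g. by AUC-ROC:
#     results = find_best_model(df, 'test', 'CVR')
#     results.sort_values('auc-roc', ascending=False).head()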
# Create training and test sets: code taken from Aurelien Geron's github
# https://github.com/ageron/handson-ml
def test_set_check(identifier, test_ratio, hash):
    # An instance goes to the test set iff the last byte of the hash of its id
    # is below 256 * test_ratio; membership is a pure function of the id
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio


def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]


def main():
    df = pd.read_csv('./data/ECommerceDataSet.csv')
    # Remove outliers
    df = df.loc[df['SessionRevenue'] != 500000]
    # Apply undersampling
    df = under_sampling(df, 'CVR')
    # Find the best model
    find_best_model(df, 'small', 'CVR', 'ml_results_after_undersampling.csv')
    # Fit the best model, in this case a random forest
    fit_random_forest(df, 'CVR')
    # Find AUC for the 1-feature decision tree as the baseline result
    baseline_model(df, 'CVR')


if __name__ == "__main__":
    main()
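A note on the split used above: because test-set membership depends only on the MD5 hash of each row's NewID, the split is deterministic across runs, unlike a random shuffle. A quick check (a sketch, assuming ml.py's functions are importable as shown):

# Two calls produce identical splits: membership is md5(id), not random state.
import pandas as pd
frame = pd.DataFrame({'NewID': range(100)})
train_a, test_a = split_train_test_by_id(frame.copy(), 0.3, 'NewID')
train_b, test_b = split_train_test_by_id(frame.copy(), 0.3, 'NewID')
assert test_a['NewID'].tolist() == test_b['NewID'].tolist()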
project1.py
Source:project1.py
# -*- coding: utf-8 -*-
"""Copy of Copy 44 Untitled1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UsAVBsFGTXnNG0vWps5m_3Cht6QQqZeG

# **[call all the basic libraries:]**
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

"""# **start reading the training and testing data sets:**"""
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
train_df.head()
train_df.info()
dim = pd.read_csv('/content/drive/MyDrive/train.csv')
dim.info()
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')
test_df.head()
test_df.info()
mean_d = dim['price'].mean()
print("Mean Value of Diamonds: $", mean_d)
dim.describe()
dim.describe(include='object')
dim.nunique()

"""# 3. **visualize** the data"""
dim.hist(figsize=(18, 10))
sns.pairplot(dim, y_vars='price')
sns.pairplot(dim)
dim["cut"].value_counts() / len(dim)
corr_matrix = dim.corr(numeric_only=True)  # numeric_only needed on pandas >= 2.0
corr_matrix
corr_matrix['price'].sort_values(ascending=False)
plt.figure(figsize=(12, 8))
corr_matrix['price'].sort_values(ascending=False).plot(kind='bar')
plt.figure(figsize=(16, 5))
heato = sns.heatmap(corr_matrix, cmap='BrBG', annot=True)
heato.set_title('Correlation Heatmap', fontdict={'fontsize': 25})
dim.plot.scatter(x='carat', y='price', figsize=(10, 5))
dim.plot.scatter(x='z', y='price', figsize=(10, 5))
input_cat_columns = dim.select_dtypes(include=['object']).columns.to_list()
for col in input_cat_columns:
    sns.catplot(x=col, y="price", kind="box", dodge=False, height=5, aspect=3, data=dim)

"""# **Removing the outliers:**"""
# Drop rows outside the 1.5*IQR fences, one column at a time
Q1 = dim['depth'].quantile(0.25)
Q3 = dim['depth'].quantile(0.75)
IQR = Q3 - Q1
idx = ~((dim['depth'] < (Q1 - 1.5 * IQR)) | (dim['depth'] > (Q3 + 1.5 * IQR)))
d1 = dim[idx]
d1.info()
Q1x = dim['x'].quantile(0.25)
Q3x = dim['x'].quantile(0.75)
IQRx = Q3x - Q1x
idxx = (d1['x'] > (Q1x - 1.5 * IQRx)) & (d1['x'] < (Q3x + 1.5 * IQRx))
dx = d1[idxx]
dx.info()
Q1y = dim['y'].quantile(0.25)
Q3y = dim['y'].quantile(0.75)
IQRy = Q3y - Q1y
idxy = (dx['y'] > (Q1y - 1.5 * IQRy)) & (dx['y'] < (Q3y + 1.5 * IQRy))
dy = dx[idxy]
dy.info()
Q1z = dim['z'].quantile(0.25)
Q3z = dim['z'].quantile(0.75)
IQRz = Q3z - Q1z
idxz = (dy['z'] > (Q1z - 1.5 * IQRz)) & (dy['z'] < (Q3z + 1.5 * IQRz))
dz = dy[idxz]
dz.describe()
# dz.info()
Q1ca = dim['carat'].quantile(0.25)
Q3ca = dim['carat'].quantile(0.75)
IQRca = Q3ca - Q1ca
idxca = (dz['carat'] > (Q1ca - 1.5 * IQRca)) & (dz['carat'] < (Q3ca + 1.5 * IQRca))
dca = dz[idxca]
dca.info()
Q1ta = dim['table'].quantile(0.25)
Q3ta = dim['table'].quantile(0.75)
IQRta = Q3ta - Q1ta
idxta = (dca['table'] > (Q1ta - 1.5 * IQRta)) & (dca['table'] < (Q3ta + 1.5 * IQRta))
d = dca[idxta]
d.info()
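# The six nearly identical blocks above could be collapsed into a helper. This
# is a sketch, not part of the original notebook; note it computes the quantile
# fences on the frame it receives, whereas the notebook always computes them on
# the full `dim` frame while filtering a progressively smaller one.
def iqr_filter(frame, col, whisker=1.5):
    """Keep rows whose `col` value lies within the whisker*IQR fences."""
    q1, q3 = frame[col].quantile(0.25), frame[col].quantile(0.75)
    iqr = q3 - q1
    return frame[frame[col].between(q1 - whisker * iqr, q3 + whisker * iqr)]

# Hypothetical equivalent of the section above:
#     d = dim
#     for col in ['depth', 'x', 'y', 'z', 'carat', 'table']:
#         d = iqr_filter(d, col)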
# NOTE: the outlier-filtered frame `d` is built above, but the split below uses
# the full `dim` frame, as in the original notebook.
dix = dim.drop('price', axis=1)
diy = dim['price']
x_train, x_test, y_train, y_test = train_test_split(dix, diy, test_size=0.25, random_state=42)
# def prepare_data(df):
#     num_attribs = df.select_dtypes(include=[np.number]).columns.to_list()
#     num_pipeline = Pipeline([('std_scaler', StandardScaler())])
#     cat_attribs = ["color", "clarity", "cut"]
#     full_pipeline = ColumnTransformer([
#         ("num", num_pipeline, num_attribs),
#         ("cat", OneHotEncoder(), cat_attribs),
#     ])
#     data_prepared = full_pipeline.fit_transform(df)
num_attribs = x_train.select_dtypes(include=[np.number]).columns.to_list()
num_pipeline = Pipeline([('std_scaler', StandardScaler())])
cat_attribs = ["color", "clarity", "cut"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
train_prepared = full_pipeline.fit_transform(x_train)
# transform (not fit_transform) on the test split, so the scaler and encoder
# are fitted on training data only
test_prepared = full_pipeline.transform(x_test)
# cat_cols = d.select_dtypes(include='object').columns.to_list()
# dim1 = pd.get_dummies(d, columns=cat_cols, drop_first=True)
# x = dim1.drop('price', axis=1)
# y = dim1['price']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

"""# **start Selecting and Training some Models**

**1. LinearRegression model**
"""
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, y_train)
# some_data_prepared = full_pipeline.transform(some_data)
# print("Predictions:", lin_reg.predict(some_data_prepared))
# print("Labels:", list(some_labels))
dim1_predictions = lin_reg.predict(test_prepared)
lin_mse = mean_squared_error(y_test, dim1_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

"""*** Using Cross-Validation***"""
lin_scores = cross_val_score(lin_reg, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("Scores: ", lin_rmse_scores)
print("Mean: ", lin_rmse_scores.mean())
print("Standard Deviation: ", lin_rmse_scores.std())

"""**2. Decision Tree Regressor model**"""
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_prepared, y_train)
dimtree_predictions = tree_reg.predict(test_prepared)
tree_mse = mean_squared_error(y_test, dimtree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

"""*** Using Cross-Validation***"""
scores = cross_val_score(tree_reg, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("Scores: ", tree_rmse_scores)
print("Mean: ", tree_rmse_scores.mean())
print("Standard Deviation: ", tree_rmse_scores.std())
"""**3. RandomForest Model:**"""
rand_for = RandomForestRegressor()
rand_for.fit(train_prepared, y_train)
ranfor_predictions = rand_for.predict(test_prepared)
forest_mse = mean_squared_error(y_test, ranfor_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

"""## Evaluation Models Using Cross-Validation
"""
forest_scores = cross_val_score(rand_for, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("Scores: ", forest_rmse_scores)
print("Mean: ", forest_rmse_scores.mean())
print("Standard Deviation: ", forest_rmse_scores.std())
# transform only: the pipeline is already fitted on the training split
final_test = full_pipeline.transform(test_df)
# score the provided test file so the predictions line up with test_df['Id']
predictions = pd.Series(rand_for.predict(final_test))
pred = pd.DataFrame({'Id': test_df['Id'], 'price': predictions})
pred

"""# **Final Tune using Grid Search:**"""
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}
]
fore_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(fore_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(train_prepared, y_train)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
final_model = grid_search.best_estimator_
final_model
final_predictions = final_model.predict(test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
# regenerate the submission predictions with the tuned model
final_predictions = pd.Series(final_model.predict(final_test))
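project1.py assembles the `pred` submission frame but never writes it to disk. A one-line addition would persist it (the file name here is an assumption, not from the notebook):

pred.to_csv('submission.csv', index=False)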
__init__.py
Source:__init__.py
from unittest import TestSuite

import test_doctests, test_prepared, test_equality, test_geomseq, test_xy
import test_collection, test_emptiness, test_singularity, test_validation


def test_suite():
    suite = TestSuite()
    suite.addTest(test_doctests.test_suite())
    suite.addTest(test_prepared.test_suite())
    suite.addTest(test_emptiness.test_suite())
    suite.addTest(test_equality.test_suite())
    suite.addTest(test_geomseq.test_suite())
    suite.addTest(test_xy.test_suite())
    suite.addTest(test_collection.test_suite())
    suite.addTest(test_singularity.test_suite())
    suite.addTest(test_validation.test_suite())
    return suite
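To actually execute the aggregated suite, a small runner along these lines would work (a sketch, not part of the original file):

# Run the combined suite with unittest's text runner
import unittest

if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(test_suite())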
