Source:data_split.py  
# Note: used to preprocess simulated or real HSC galaxies.
# We will split the dataset into five subsets:
# 1. training set for GaMorNet
# 2. validation set for GaMorNet
# 3. training set for PSFGAN
# 4. validation set for PSFGAN
# 5. common test set for GaMorNet + PSFGAN
# Modified from "sim_gal_preprocess.py"
import argparse
import glob
import os

import numpy as np
import pandas
from astropy.io import fits

# Paths
core_path = '/gpfs/loomis/project/urry/ct564/HSC/PSFGAN/'
galaxy_main = core_path + 'dimauro_0.5_1.0/'

# Other parameters
# Order doesn't matter (e.g. ['g', 'r'] is the same as ['r', 'g'])
filter_strings = ['g', 'r', 'i', 'z', 'y']

# Flux conversion parameters,
# i.e. [flux in nanoJy] * nJy_to_adu_per_AA = [flux in adu]
# HSC uses nanoJy; GalSim uses adu.
# HSC Wide-G: radius=4.1m, exp_time=10min, quantum efficiency=0.864, gain=3.0, lambda effective=4754 Angstrom
# HSC Wide-R: radius=4.1m, exp_time=10min, quantum efficiency=0.956, gain=3.0, lambda effective=6175 Angstrom
# HSC Wide-I: radius=4.1m, exp_time=20min, quantum efficiency=0.882, gain=3.0, lambda effective=7711 Angstrom
# HSC Wide-Z: radius=4.1m, exp_time=20min, quantum efficiency=0.821, gain=3.0, lambda effective=8898 Angstrom
# HSC Wide-Y: radius=4.1m, exp_time=20min, quantum efficiency=0.517, gain=3.0, lambda effective=9762 Angstrom
nJy_to_adu_per_AA_filters = [0.0289698414, 0.0246781434, 0.0364652697, 0.0294152337, 0.0168839201]

# The desired image shape. Images of other shapes are padded/cropped to it below;
# galaxies that still don't match are filtered out.
desired_shape = [239, 239]

# Source names, grouped by catalog schema
gal_sim_sources = ("gal_sim_0_0.25", "gal_sim_0.25_0.5", "gal_sim_0.5_0.75", "gal_sim_0.5_1.0")
dimauro_sources = ("dimauro_0_0.5", "dimauro_0.5_0.75", "dimauro_0.5_1.0")

parser = argparse.ArgumentParser()


def catalog_row(source, current_row, obj_id, filter_string, filter_index):
    """Build one new-catalog row for an image; the columns depend on the raw-data source."""
    if source in gal_sim_sources:
        # Simulated galaxies: fluxes are already in adu
        return {'object_id': obj_id,
                'num_components': current_row['num_components'],
                'sersic_idx_d': current_row['sersic_idx_d'],
                'R_e_d': current_row['R_e_d'],
                'axis_ratio_d': current_row['axis_ratio_d'],
                'PA_d': current_row['PA_d'],
                'flux_frac_d': current_row['flux_frac_d'],
                'sersic_idx_b': current_row['sersic_idx_b'],
                'R_e_b': current_row['R_e_b'],
                'axis_ratio_b': current_row['axis_ratio_b'],
                'PA_b': current_row['PA_b'],
                'flux_frac_b': current_row['flux_frac_b'],
                filter_string + '_total_flux': current_row['total_flux']}
    # Real galaxies: convert cmodel fluxes from nanoJy to adu
    total_flux = current_row[filter_string + '_cmodel_flux'] * nJy_to_adu_per_AA_filters[filter_index]
    if source == "simard":
        return {'object_id': obj_id,
                'ra': current_row['ra'],
                'dec': current_row['dec'],
                'photoz_best': current_row['photoz_best'],
                'SClass': current_row['SClass'],
                'z': current_row['z'],
                'Scale': current_row['Scale'],
                'Rhlg': current_row['Rhlg'],
                'Rhlr': current_row['Rhlr'],
                'Rchl,g': current_row['Rchl,g'],
                'Rchl,r': current_row['Rchl,r'],
                '(B/T)g': current_row['(B/T)g'],
                'e(B/T)g': current_row['e(B/T)g'],
                '(B/T)r': current_row['(B/T)r'],
                'e(B/T)r': current_row['e(B/T)r'],
                filter_string + '_total_flux': total_flux}
    if source in dimauro_sources:
        return {'object_id': obj_id,
                'ra': current_row['ra'],
                'dec': current_row['dec'],
                'photoz_best': current_row['photoz_best'],
                'RE_F606': current_row['RE_F606'],
                'RE_F814': current_row['RE_F814'],
                'RE_F125': current_row['RE_F125'],
                'RE_F160': current_row['RE_F160'],
                'N_F606': current_row['N_F606'],
                'N_F814': current_row['N_F814'],
                'N_F125': current_row['N_F125'],
                'N_F160': current_row['N_F160'],
                'B_T_m': current_row['B_T_m'],
                filter_string + '_total_flux': total_flux}
    raise ValueError('Unknown source: %s' % source)


def data_split():
    # Make the split predictable
    np.random.seed(42)
    parser.add_argument("--gmn_train", default=6300)
    parser.add_argument("--gmn_eval", default=700)
    parser.add_argument("--psf_train", default=4500)
    parser.add_argument("--psf_eval", default=500)
    parser.add_argument("--test", default=1528)
    parser.add_argument("--shuffle", default="1")
    # Identify the source of the raw data. This determines the column names in the catalogs being created.
    # Options: "sim_hsc_0_0.25", "simard_cross_hsc"... (more to be added)
    parser.add_argument("--source", default="dimauro_0.5_1.0")
    parser.add_argument("--split", default="unequal")
    args = parser.parse_args()

    gmn_train = int(args.gmn_train)
    gmn_eval = int(args.gmn_eval)
    psf_train = int(args.psf_train)
    psf_eval = int(args.psf_eval)
    test = int(args.test)
    shuffle = bool(int(args.shuffle))
    source = str(args.source)
    split = str(args.split)

    num_filters = len(filter_strings)
    num_total = 0
    num_gmn_train = 0
    num_gmn_eval = 0
    num_psf_train = 0
    num_psf_eval = 0
    num_test = 0
    num_resized = 0
    num_correctly_resized = 0
    num_negative_flux = 0

    # Input and output locations
    hsc_folders = []
    hsc_catalogs = []
    gmn_train_folders = []
    gmn_eval_folders = []
    psf_train_folders = []
    psf_eval_folders = []
    test_folders = []
    gmn_train_catalogs = []
    gmn_eval_catalogs = []
    psf_train_catalogs = []
    psf_eval_catalogs = []
    test_catalogs = []
    for filter_string in filter_strings:
        galaxy_per_filter = galaxy_main + filter_string + '-band/'
        hsc_folder = glob.glob(galaxy_per_filter + 'raw_data/images/')[0]
        hsc_catalog = pandas.read_csv(glob.glob(galaxy_per_filter + 'raw_data/*.csv')[0])
        gmn_train_folder = galaxy_per_filter + 'gmn_train/'
        gmn_eval_folder = galaxy_per_filter + 'gmn_eval/'
        psf_train_folder = galaxy_per_filter + 'fits_train/'
        psf_eval_folder = galaxy_per_filter + 'fits_eval/'
        test_folder = galaxy_per_filter + 'fits_test/'
        for folder in (gmn_train_folder, gmn_eval_folder, psf_train_folder, psf_eval_folder, test_folder):
            if not os.path.exists(folder):
                os.makedirs(folder)

        if source in gal_sim_sources:
            column_list = ['object_id', 'num_components', 'sersic_idx_d', 'R_e_d', 'axis_ratio_d', 'PA_d',
                           'flux_frac_d', 'sersic_idx_b', 'R_e_b', 'axis_ratio_b', 'PA_b', 'flux_frac_b',
                           filter_string + '_total_flux']
        elif source == "simard":
            column_list = ['object_id', 'ra', 'dec', 'photoz_best', 'SClass', 'z', 'Scale', 'Rhlg', 'Rhlr',
                           'Rchl,g', 'Rchl,r', '(B/T)g', 'e(B/T)g', '(B/T)r', 'e(B/T)r',
                           filter_string + '_total_flux']
        elif source in dimauro_sources:
            column_list = ['object_id', 'ra', 'dec', 'photoz_best', 'RE_F606', 'RE_F814', 'RE_F125', 'RE_F160',
                           'N_F606', 'N_F814', 'N_F125', 'N_F160', 'B_T_m',
                           filter_string + '_total_flux']
        gmn_train_catalog = pandas.DataFrame(columns=column_list)
        gmn_eval_catalog = pandas.DataFrame(columns=column_list)
        psf_train_catalog = pandas.DataFrame(columns=column_list)
        psf_eval_catalog = pandas.DataFrame(columns=column_list)
        test_catalog = pandas.DataFrame(columns=column_list)

        hsc_folders.append(hsc_folder)
        hsc_catalogs.append(hsc_catalog)
        gmn_train_folders.append(gmn_train_folder)
        gmn_eval_folders.append(gmn_eval_folder)
        psf_train_folders.append(psf_train_folder)
        psf_eval_folders.append(psf_eval_folder)
        test_folders.append(test_folder)
        gmn_train_catalogs.append(gmn_train_catalog)
        gmn_eval_catalogs.append(gmn_eval_catalog)
        psf_train_catalogs.append(psf_train_catalog)
        psf_eval_catalogs.append(psf_eval_catalog)
        test_catalogs.append(test_catalog)

    # Main loop
    # Iterate over row numbers based on the first catalog in hsc_catalogs
    # (row 2 is the first data row, matching the spreadsheet-style numbering of the raw catalogs)
    row_num_list = list(range(2, len(hsc_catalogs[0]) + 2))

    # Equal or unequal data split
    # When using the "unequal" split, make sure hsc_catalogs[0] already has an "is_bulge" label column.
    if split == "equal":
        if shuffle:
            np.random.shuffle(row_num_list)
    elif split == "unequal":
        # Get the bulge list first
        bulge_list = list(hsc_catalogs[0]["is_bulge"])
        num_bulges = np.sum(bulge_list)
        num_non_bulges = len(hsc_catalogs[0]) - num_bulges
        # Then sort row_num_list according to bulge_list (bulges are sorted to the bottom)
        row_num_list = [x for _, x in sorted(zip(bulge_list, row_num_list))]
        if shuffle:
            # First shuffle the subset of bulges and the subset of non-bulges separately
            non_bulge_row_num_list = row_num_list[:num_non_bulges]
            bulge_row_num_list = row_num_list[num_non_bulges:]
            np.random.shuffle(non_bulge_row_num_list)
            np.random.shuffle(bulge_row_num_list)
            row_num_list = non_bulge_row_num_list + bulge_row_num_list
            # Next shuffle the psf_train & psf_eval subset and the gmn_train & gmn_eval & test subset
            psf_row_num_list = row_num_list[:(psf_train + psf_eval)]
            gmn_test_row_num_list = row_num_list[(psf_train + psf_eval):]
            np.random.shuffle(psf_row_num_list)
            np.random.shuffle(gmn_test_row_num_list)
            row_num_list = psf_row_num_list + gmn_test_row_num_list

    for row_num in row_num_list:
        if source in gal_sim_sources:
            obj_id = int(row_num - 2)
        else:  # "simard" or "dimauro_*"
            obj_id = int(row_num)

        # Read the images
        images = []
        for i in range(num_filters):
            if source in gal_sim_sources:
                fits_path = '%s/%s.fits' % (hsc_folders[i], obj_id)
            else:
                fits_path = '%s/%s-cutout-*.fits' % (hsc_folders[i], obj_id)
            file = glob.glob(fits_path)[0]
            image = fits.getdata(file)
            images.append(image)

        # Check whether the flux is positive in each filter.
        # If not, skip this galaxy.
        positive_flux_booleans = []
        for i in range(num_filters):
            current_row = hsc_catalogs[i].iloc[row_num - 2]
            if source in gal_sim_sources:
                total_flux = current_row['total_flux']
            else:
                total_flux = current_row[filter_strings[i] + '_cmodel_flux']
            positive_flux_booleans.append(total_flux >= 0)
        if False in positive_flux_booleans:
            num_negative_flux += 1
            continue

        # Check whether the images have the desired shape in each filter.
        # If not, reflect-pad (too small) or center-crop (too large) each axis;
        # when the difference is odd, the extra pixel goes to the trailing edge.
        desired_shape_booleans = []
        for i in range(num_filters):
            current_shape = list(images[i].shape)
            if current_shape[0] == desired_shape[0] and current_shape[1] == desired_shape[1]:
                desired_shape_booleans.append(True)
                continue
            desired_shape_booleans.append(False)
            # Resize the first dimension
            diff = desired_shape[0] - current_shape[0]
            if diff > 0:
                images[i] = np.pad(images[i], ((diff // 2, diff - diff // 2), (0, 0)), 'reflect')
            elif diff < 0:
                start = (-diff) // 2
                images[i] = images[i][start:start + desired_shape[0], :]
            # Then resize the second dimension
            diff = desired_shape[1] - current_shape[1]
            if diff > 0:
                images[i] = np.pad(images[i], ((0, 0), (diff // 2, diff - diff // 2)), 'reflect')
            elif diff < 0:
                start = (-diff) // 2
                images[i] = images[i][:, start:start + desired_shape[1]]
        if False in desired_shape_booleans:
            num_resized += 1
            # Check that the galaxy has been correctly resized in every filter
            correctly_resized_booleans = [list(images[i].shape) == desired_shape for i in range(num_filters)]
            if False not in correctly_resized_booleans:
                num_correctly_resized += 1

        # Assign this galaxy to the first subset that still has room
        # (PSFGAN train/eval are filled first, then GaMorNet train/eval, then the common test set)
        if num_psf_train < psf_train:
            folders, catalogs = psf_train_folders, psf_train_catalogs
            num_psf_train += 1
        elif num_psf_eval < psf_eval:
            folders, catalogs = psf_eval_folders, psf_eval_catalogs
            num_psf_eval += 1
        elif num_gmn_train < gmn_train:
            folders, catalogs = gmn_train_folders, gmn_train_catalogs
            num_gmn_train += 1
        elif num_gmn_eval < gmn_eval:
            folders, catalogs = gmn_eval_folders, gmn_eval_catalogs
            num_gmn_eval += 1
        elif num_test < test:
            folders, catalogs = test_folders, test_catalogs
            num_test += 1
        else:
            continue  # every subset is full
        num_total += 1
        for i in range(num_filters):
            # Save the image
            image_name = folders[i] + str(obj_id) + '-' + filter_strings[i] + '.fits'
            hdu = fits.PrimaryHDU(images[i])
            hdu.writeto(image_name, overwrite=True)
            # Also, create a row for this image in the new catalog
            # (DataFrame.append was removed in pandas 2.0, hence pandas.concat)
            current_row = hsc_catalogs[i].iloc[row_num - 2]
            new_row = catalog_row(source, current_row, obj_id, filter_strings[i], i)
            catalogs[i] = pandas.concat([catalogs[i], pandas.DataFrame([new_row])], ignore_index=True)

    # At last, save the catalogs
    for i in range(num_filters):
        galaxy_per_filter = galaxy_main + filter_strings[i] + '-band/'
        gmn_train_catalogs[i].to_csv(galaxy_per_filter + 'gmn_train.csv', index=False)
        gmn_eval_catalogs[i].to_csv(galaxy_per_filter + 'gmn_eval.csv', index=False)
        psf_train_catalogs[i].to_csv(galaxy_per_filter + 'catalog_train.csv', index=False)
        psf_eval_catalogs[i].to_csv(galaxy_per_filter + 'catalog_eval.csv', index=False)
        test_catalogs[i].to_csv(galaxy_per_filter + 'catalog_test.csv', index=False)

    # Print out how many galaxies were selected
    print(str(num_total) + ' galaxies are selected in total:')
    print(str(num_gmn_train) + ' galaxies in the train set for GaMorNet')
    print(str(num_gmn_eval) + ' galaxies in the eval set for GaMorNet')
    print(str(num_psf_train) + ' galaxies in the train set for PSFGAN')
    print(str(num_psf_eval) + ' galaxies in the eval set for PSFGAN')
    print(str(num_test) + ' galaxies in the common test set')
    print(str(num_resized) + ' galaxies have been resized for having different initial sizes')
    print(str(num_correctly_resized) + ' galaxies have been CORRECTLY resized')
    print(str(num_negative_flux) + ' galaxies are discarded for having negative flux(es) in at least one filter')


if __name__ == '__main__':
    data_split()
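The pad-or-crop rule above is easiest to verify on a toy array: each axis is reflect-padded when too short and center-cropped when too long, with the odd pixel going to the trailing edge. Below is a minimal standalone sketch of that per-axis logic (the function name and test shapes are invented for illustration):

import numpy as np

def center_resize(image, desired=(239, 239)):
    # Reflect-pad or center-crop each axis to the desired length.
    for axis, target in enumerate(desired):
        current = image.shape[axis]
        if current < target:
            diff = target - current
            pad = [(0, 0), (0, 0)]
            pad[axis] = (diff // 2, diff - diff // 2)  # extra pixel goes after
            image = np.pad(image, pad, 'reflect')
        elif current > target:
            start = (current - target) // 2
            image = np.take(image, range(start, start + target), axis=axis)
    return image

print(center_resize(np.zeros((240, 236))).shape)  # -> (239, 239)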
Source:log_reader.py  
# Standard Library
import base64
import itertools
import json
from dataclasses import asdict, dataclass
from typing import Iterable, List, Optional

# Sematic
from sematic import storage
from sematic.abstract_future import FutureState
from sematic.db.models.resolution import Resolution, ResolutionKind, ResolutionStatus
from sematic.db.queries import get_resolution, get_run
from sematic.resolvers.cloud_resolver import (
    END_INLINE_RUN_INDICATOR,
    START_INLINE_RUN_INDICATOR,
)
from sematic.scheduling.external_job import JobType

# Why the "V1"? Because we will likely want to change the structure of
# the logs such that each file contains a different subset of logs. But
# when we make this change, we will still want logs written in the old
# structure to be readable, at least for a while. So we need to identify
# which structure the files are in somehow, and a v1/v2 prefix is how we
# can do it.
V1_LOG_PATH_FORMAT = "logs/v1/run_id/{run_id}/{log_kind}/"


def log_prefix(run_id: str, job_type: JobType):
    return V1_LOG_PATH_FORMAT.format(run_id=run_id, log_kind=job_type.value)


@dataclass
class LogLineResult:
    """Results of a query for log lines

    Attributes
    ----------
    more_before:
        Are there more lines before the first line returned?
    more_after:
        Are there more lines after the last line returned? Will be True
        if the answer is known to be yes, False if the answer is known to
        be no. If the answer is not known (i.e. run may still be in
        progress), True will be returned.
    lines:
        The actual log lines
    continuation_cursor:
        A string that can be used to continue traversing these logs from where you left
        off. If more_after is False, this will be set to None.
    log_unavailable_reason:
        A human-readable reason why logs are not available.
    """

    more_before: bool
    more_after: bool
    lines: List[str]
    continuation_cursor: Optional[str]
    log_unavailable_reason: Optional[str] = None


@dataclass
class Cursor:
    """A cursor representing a particular place in the process of traversing logs.

    Attributes
    ----------
    source_log_key:
        The storage key for the log that was being used when the search left off. If no
        logs have been found yet, will be None
    source_file_line_index:
        The line number BEFORE filters are applied within the log file being read.
        It will be the first line that HASN'T yet been read. If no logs have been found,
        will be -1.
    filter_strings:
        The filter strings that were used for this log traversal.
    run_id:
        The run id that was being used for this log traversal.
    """

    # Why include the source log file? Because we will soon likely want to break up
    # the logs for a single run such that each file contains a *different*
    # portion of the logs, and we will need to know which file to go to in
    # order to pick back up. The alternative would be to require
    # re-traversing already traversed files when continuing.
    source_log_key: Optional[str]
    source_file_line_index: int
    filter_strings: List[str]
    run_id: str

    def to_token(self) -> str:
        return str(
            base64.b64encode(json.dumps(asdict(self)).encode("utf8")), encoding="utf8"
        )

    @classmethod
    def from_token(cls, token: str):
        kwargs = json.loads(
            base64.b64decode(bytes(token, encoding="utf8")).decode("utf8")
        )
        return Cursor(**kwargs)

    @classmethod
    def nothing_found(cls, filter_strings: List[str], run_id: str):
        return Cursor(
            source_log_key=None,
            source_file_line_index=-1,
            filter_strings=filter_strings,
            run_id=run_id,
        )


@dataclass
class LogLine:
    source_file: str
    source_file_index: int
    line: str


def load_log_lines(
    run_id: str,
    continuation_cursor: Optional[str],
    max_lines: int,
    filter_strings: Optional[List[str]] = None,
) -> LogLineResult:
    """Load a portion of the logs for a particular run

    Parameters
    ----------
    run_id:
        The id of the run to get logs for
    continuation_cursor:
        A cursor indicating where to continue reading logs from. Should be
        None if the logs are being read from the beginning.
    max_lines:
        The highest number of log lines that should be returned at once
    filter_strings:
        Only log lines that contain ALL of the strings in this list will
        be included in the result

    Returns
    -------
    A subset of the logs for the given run
    """
    run = get_run(run_id)
    run_state = FutureState[run.future_state]  # type: ignore
    still_running = not (run_state.is_terminal() or run_state == FutureState.RAN)
    resolution = get_resolution(run.root_id)
    filter_strings = filter_strings if filter_strings is not None else []
    cursor = (
        Cursor.from_token(continuation_cursor)
        if continuation_cursor is not None
        else Cursor.nothing_found(filter_strings, run_id)
    )
    if cursor.run_id != run_id:
        raise ValueError(
            f"Tried to continue a log search of {run_id} using a "
            f"continuation cursor from {cursor.run_id}"
        )
    if set(cursor.filter_strings) != set(filter_strings):
        raise ValueError(
            f"Tried to continue a log search of {run_id} using a "
            f"different set of filters than were used in the cursor."
        )
    if ResolutionStatus[resolution.status] in (  # type: ignore
        ResolutionStatus.CREATED,
        ResolutionStatus.SCHEDULED,
    ):
        return LogLineResult(
            more_before=False,
            more_after=True,
            lines=[],
            continuation_cursor=cursor.to_token(),
            log_unavailable_reason="Resolution has not started yet.",
        )
    if FutureState[run.future_state] == FutureState.CREATED:  # type: ignore
        return LogLineResult(
            more_before=False,
            more_after=True,
            lines=[],
            continuation_cursor=cursor.to_token(),
            log_unavailable_reason="The run has not yet started executing.",
        )
    # Looking for external jobs to determine inline is only valid
    # since we know the run has at least reached SCHEDULED due to it
    # not being CREATED.
    is_inline = len(run.external_jobs) == 0
    if is_inline:
        return _load_inline_logs(
            run_id=run_id,
            resolution=resolution,
            still_running=still_running,
            cursor_file=cursor.source_log_key,
            cursor_line_index=cursor.source_file_line_index,
            max_lines=max_lines,
            filter_strings=filter_strings,
        )
    return _load_non_inline_logs(
        run_id=run_id,
        still_running=still_running,
        cursor_file=cursor.source_log_key,
        cursor_line_index=cursor.source_file_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
    )


def _get_latest_log_file(prefix, cursor_file) -> Optional[str]:
    # Recall that for v1 logs, each log file contains ALL the logs from
    # the beginning of the run until the time that file was uploaded. So
    # the latest log file contains all the logs we have for the run.
    log_files = storage.get_child_paths(prefix)
    if len(log_files) < 1:
        return None
    if cursor_file is not None and cursor_file not in log_files:
        raise RuntimeError(
            f"Trying to continue a log traversal from {cursor_file}, but "
            f"that file doesn't exist."
        )
    # The file with the highest timestamp has the full logs.
    latest_log_file = max(
        log_files,
        key=lambda path_key: int(
            path_key.replace(prefix, "").replace(".log", "").replace("/", "")
        ),
    )
    return latest_log_file


def _load_non_inline_logs(
    run_id: str,
    still_running: bool,
    cursor_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
) -> LogLineResult:
    """Load the lines for runs that are NOT inline"""
    prefix = log_prefix(run_id, JobType.worker)
    latest_log_file = _get_latest_log_file(prefix, cursor_file)
    if latest_log_file is None:
        return LogLineResult(
            more_before=False,
            more_after=still_running,
            lines=[],
            continuation_cursor=Cursor.nothing_found(filter_strings, run_id).to_token()
            if still_running
            else None,
            log_unavailable_reason="No log files found",
        )
    text_stream = storage.get_line_stream(latest_log_file)
    line_stream = (
        LogLine(source_file=latest_log_file, source_file_index=i, line=ln)
        for i, ln in zip(itertools.count(), text_stream)
    )
    return get_log_lines_from_line_stream(
        line_stream=line_stream,
        still_running=still_running,
        cursor_source_file=latest_log_file,
        cursor_line_index=cursor_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
        run_id=run_id,
    )


def _load_inline_logs(
    run_id: str,
    resolution: Resolution,
    still_running: bool,
    cursor_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
) -> LogLineResult:
    """Load the lines for runs that ARE inline"""
    if ResolutionKind[resolution.kind] == ResolutionKind.LOCAL:  # type: ignore
        return LogLineResult(
            more_before=False,
            more_after=False,
            lines=[],
            continuation_cursor=None,
            log_unavailable_reason=(
                "UI logs are only available for runs that "
                "(a) are executed using the CloudResolver and "
                "(b) are using the resolver in non-detached mode OR have inline=False."
            ),
        )
    prefix = log_prefix(resolution.root_id, JobType.driver)
    latest_log_file = _get_latest_log_file(prefix, cursor_file)
    if latest_log_file is None:
        return LogLineResult(
            more_before=False,
            more_after=still_running,
            continuation_cursor=Cursor.nothing_found(filter_strings, run_id).to_token()
            if still_running
            else None,
            lines=[],
            log_unavailable_reason="Resolver logs are missing",
        )
    text_stream: Iterable[str] = storage.get_line_stream(latest_log_file)
    line_stream = _filter_for_inline(text_stream, run_id, latest_log_file)
    return get_log_lines_from_line_stream(
        line_stream=line_stream,
        still_running=still_running,
        cursor_source_file=cursor_file,
        cursor_line_index=cursor_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
        run_id=run_id,
    )


def _filter_for_inline(
    text_stream: Iterable[str], run_id: str, source_file: str
) -> Iterable[LogLine]:
    """Stream resolver logs to make a new stream with only lines for a particular run"""
    expected_start = START_INLINE_RUN_INDICATOR.format(run_id)
    expected_end = END_INLINE_RUN_INDICATOR.format(run_id)
    buffer_iterator = iter(text_stream)
    found_start = False
    file_line_index = 0
    while True:
        try:
            line = next(buffer_iterator)
        except StopIteration:
            # If a resolver dies mid-execution of an inline run,
            # we should treat the end of the existing lines as
            # the end of whatever inline run we were looking for.
            break
        if expected_start in line:
            found_start = True
            continue
        if not found_start:
            continue
        if expected_end in line:
            break
        yield LogLine(
            source_file=source_file, source_file_index=file_line_index, line=line
        )
        file_line_index += 1


def get_log_lines_from_line_stream(
    line_stream: Iterable[LogLine],
    still_running: bool,
    cursor_source_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
    run_id: str,
) -> LogLineResult:
    """Given a stream of log lines, produce an object containing the desired subset

    Parameters
    ----------
    line_stream:
        An iterable stream of log lines
    still_running:
        A boolean indicating whether the run these logs are for is still running or not
    cursor_source_file:
        The source file to continue from. No lines should be returned until this file is
        reached.
    cursor_line_index:
        The line index within the source file to continue from. No lines should be
        returned until this index is reached.
    max_lines:
        The maximum number of lines that should be returned
    filter_strings:
        A list of strings to filter log lines by. Only log lines that contain ALL of the
        filters will be returned.
    run_id:
        The id of the run the traversal is for.

    Returns
    -------
    A subset of the logs for the given run
    """
    buffer_iterator = iter(line_stream)
    keep_going = True
    lines = []
    has_more = True
    more_before = False
    source_file = None
    source_file_line_index = -1
    found_cursor = False

    def passes_filter(line: LogLine) -> bool:
        return all(substring in line.line for substring in filter_strings)

    while keep_going:
        try:
            line = next(buffer_iterator)
            source_file = line.source_file
            source_file_line_index = line.source_file_index
            if not found_cursor:
                if cursor_source_file is None or (
                    source_file == cursor_source_file
                    and source_file_line_index >= cursor_line_index
                ):
                    found_cursor = True
                else:
                    more_before = more_before or passes_filter(line)
                    continue
            if not passes_filter(line):
                continue
            lines.append(line.line)
            if len(lines) >= max_lines:
                has_more = True
                keep_going = False
        except StopIteration:
            keep_going = False
            # Hit the end of the logs produced so far. If the run is
            # done, there are no more logs. Otherwise more might show
            # up!
            has_more = still_running
    missing_reason = None if len(lines) > 0 else "No matching log lines."
    return LogLineResult(
        more_before=more_before,
        more_after=has_more,
        lines=lines,
        continuation_cursor=Cursor(
            source_log_key=source_file,
            # +1: next time we want to start AFTER where we last read
            source_file_line_index=source_file_line_index + 1,
            filter_strings=filter_strings,
            run_id=run_id,
        ).to_token()
        if has_more
        else None,
        log_unavailable_reason=missing_reason,
    )
Source:heuristic_filtering.py  
import re
from typing import List, Optional, Set, Union


def process_label(label: str, lowercase: bool = True, stop_words: Optional[Set[str]] = None) -> Union[List[str], None]:
    """Heuristically filter and process label(s)"""
    if not label:
        return None
    # Handle multi-labels
    label_delimiters_regex = re.compile('|'.join([';', '/']))
    labels = set(l.strip() for l in re.split(label_delimiters_regex, label))
    filter_strings = ['section', 'etc', 'now', 'whereas', 'exhibit ',
                      'therefore', 'article', 'in witness whereof', 'schedule']
    filtered_labels = set()
    for label in labels:
        if len(label) < 3 or len(label) > 75 or \
                not label[0].isupper() or \
                any(bw for bw in filter_strings if label.lower().startswith(bw)):
            continue
        if label[-1] in ['.', ':']:  # remove scraping artifacts
            label = label[:-1]
        label = re.sub('[ \t]+', ' ', label.replace('\n', ' ').strip())
        if label:
            if stop_words:
                if label.lower() in stop_words:
                    continue
                label_words = label.split(' ')
                if len(label_words) > 1:
                    if len(label_words[-1]) > 1 and label_words[-1].lower() in stop_words:
                        continue
                    if (label_words[0].lower() in stop_words or label_words[0].lower() in {'without', 'due'}) and \
                            label_words[0].lower() not in {'other', 'further', 'no', 'not', 'own', 'off'}:
                        continue
            label = label.lower() if lowercase else label
            filtered_labels.add(label)
    return list(filtered_labels)


def process_text(text: str) -> Union[str, None]:
    """Heuristically filter and process provision text"""
    text = text.strip()
    filter_strings = ['“ means', '" means', 'shall mean', "' means", '” means',
                      'shall have the meaning', 'has the meaning', 'have meaning']
    if len(text) < 25 or \
            text[0].islower() or \
            text[0] in ['"', '“'] or \
            any(bw for bw in filter_strings if bw in text):
        return None
    text = text.strip()
    if text[0] in ['.', ':']:
        text = text[1:].strip()
    if not text[0].isupper() and not text[0] in ['(', '[']:
        return None
    if not text[-1] == '.':
        return None
    text = re.sub('[ \t]+', ' ', text.replace('\n', ' ').strip())
    return text
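A quick usage sketch for process_label(); the example label string and stop-word set are invented:

stop_words = {'the', 'of', 'and', 'upon'}
labels = process_label('Governing Law; Severability / etc.', stop_words=stop_words)
# 'etc.' fails the leading-capital check and is dropped, while the two
# surviving labels are lowercased:
print(sorted(labels))  # -> ['governing law', 'severability']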
