How to use the examples_by_label method in hypothesis

Best Python code snippets using hypothesis
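In every snippet on this page, examples_by_label is an ordinary local variable rather than a call into Hypothesis's public API: it is a container (list or dict) that groups training examples by their class label. A minimal sketch of that shared pattern, with an illustrative function name of our own (group_examples_by_label is not from any of the sources):

from collections import defaultdict

def group_examples_by_label(examples, labels):
    # Map each label to the list of examples that carry it.
    examples_by_label = defaultdict(list)
    for example, label in zip(examples, labels):
        examples_by_label[label].append(example)
    return dict(examples_by_label)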

helpers.py

Source: helpers.py (GitHub)



from constants import *
import numpy as np, os, pickle, struct, socket
from copy import deepcopy
from bisect import bisect_left
import shutil

def get_difference(set1, set2):
    """Gets set1 - set2."""
    set1 = set(set1); set2 = set(set2)
    return list(set1.difference(set2))

def get_asn_dist(asns):
    # ASN_LIST -> %
    asn_dist = np.zeros(len(ASN_LIST))
    for asn in asns:
        try:
            asn_dist[ASN_LIST[asn]] += 1
        except KeyError:
            asn_dist[ASN_LIST["OTHER"]] += 1
    assert np.sum(asn_dist) > 0

    return asn_dist / np.sum(asn_dist)

def make_mask(n):
    """Return a mask of n bits as a long integer."""
    return (2 << n - 1) - 1

def dotted_quad_to_num(ip):
    """Convert a decimal dotted-quad string to a long integer."""
    return struct.unpack('<L', socket.inet_aton(ip))[0]

def network_mask(ip, bits):
    """Convert a network address to a long integer."""
    return dotted_quad_to_num(ip) & make_mask(bits)

def address_in_network(ip, net, netmask):
    """Is an address in a network?"""
    return ip & netmask == net

def is_internal(ip):
    ip_num = dotted_quad_to_num(ip)
    for net in INTERNAL_NETWORKS:
        net = net.split("/")
        if address_in_network(ip_num, dotted_quad_to_num(net[0]), make_mask(int(net[1]))):
            return True
    return False

def get_ip_likelihood(ip_list, _type, modify=False):
    if os.path.exists(os.path.join(METADATA_DIR, KNOWN_IP_LIST_FN)):
        known_ips = pickle.load(open(os.path.join(METADATA_DIR, KNOWN_IP_LIST_FN), 'rb'))
    else:
        known_ips = {"_types": ["twitch", "youtube", "netflix"], "data": {"twitch": [], "youtube": [], "netflix": []}}
    _types = known_ips["_types"]
    # TODO update this as the model gets more complicated
    ip_likelihood = [len(set(ip_list) & set(known_ips["data"][t])) / len(set(ip_list)) for t in _types]

    if _type == "no_video":
        # no_video ips are too numerous to track
        return ip_likelihood

    if modify:
        # These IPs were communicated with when accessing this service,
        # so add them to the data structure.
        # TODO - incorporate some sort of frequency, staleness, likelihood thing here
        for ip in ip_list:  # maybe change this to /24, although these could be v6 IPs, and there's no /24 analogy there
            known_ips["data"][_type].append(ip)
        known_ips["data"][_type] = list(set(known_ips["data"][_type]))
        pickle.dump(known_ips, open(os.path.join(METADATA_DIR, KNOWN_IP_LIST_FN), 'wb'))

    return ip_likelihood

class discrete_cdf:
    # From https://tinyurl.com/y6dlvbsb
    def __init__(self, data, weighted=False):
        self.weighted = weighted
        if weighted:
            # assume data is a sequence of (value, count of value) tuples
            self._data = [el[0] for el in data]
            self._counts = [el[1] for el in data]
            self._data_len = float(np.sum(self._counts))  # "length" is the total count
        else:
            self._data = data
            self._data_len = float(len(data))

    def __call__(self, point):
        if self.weighted:
            return np.sum(self._counts[:bisect_left(self._data, point)]) / self._data_len
        else:
            return (len(self._data[:bisect_left(self._data, point)]) /
                    self._data_len)

def get_cdf_xy(data, logx=False, logy=False, n_points=500, weighted=False):
    """Returns x, cdf for your data on either a log-lin or lin-lin plot."""

    # sort it
    if weighted:
        data.sort(key=lambda val: val[0])  # sort by the value, not the weight of the value
    else:
        data.sort()

    if logx:
        if weighted:
            if data[0][0] <= 0:
                log_low = -1
            else:
                log_low = np.floor(np.log10(data[0][0]))
            log_high = np.ceil(np.log10(data[-1][0]))
        else:
            if data[0] <= 0:  # check for bad things before you pass them to log
                log_low = -1
            else:
                log_low = np.floor(np.log10(data[0]))
            log_high = np.ceil(np.log10(data[-1]))
        x = np.logspace(log_low, log_high, num=n_points)
    elif logy:
        # Do an inverted log scale on the y axis to get an effect like
        # .9, .99, .999, etc.
        log_low = -5
        log_high = 0
        x = np.linspace(data[0], data[-1], num=n_points)
    else:
        if weighted:
            x = np.linspace(data[0][0], data[-1][0], num=n_points)
        else:
            x = np.linspace(data[0], data[-1], num=n_points)

    # Generate the CDF
    cdf_data_obj = discrete_cdf(data, weighted=weighted)
    cdf_data = [cdf_data_obj(point) for point in x]

    return [x, cdf_data]

def get_even_train_split(all_x, all_y, all_metadata, train_proportion,
                         verbose=True, is_dist=False):
    # Forms train and validation sets.
    # y is an array of labels, for various problem types.
    # Each problem type is limited by a sub-class (the one with the fewest examples).
    # Form a training set for each problem type that maximizes the number of limiting
    # examples we train on, while retaining an even number of examples from each sub-class.

    # Returns x_train, y_train, x_val, y_val;
    # each x, y -> problem_type -> examples, labels.

    # If is_dist is true, each element of y is a list of distributions, where each
    # distribution represents the label for that example.

    n_problem_types = len(all_y[0])
    X = {
        "train": {i: [] for i in range(n_problem_types)},
        "val": {i: [] for i in range(n_problem_types)},
    }
    Y = {
        "train": {i: [] for i in range(n_problem_types)},
        "val": {i: [] for i in range(n_problem_types)},
    }
    metadata = {
        "train": {i: [] for i in range(n_problem_types)},
        "val": {i: [] for i in range(n_problem_types)},
    }

    for problem_type in range(n_problem_types):
        these_labels = [_y[problem_type] for _y in all_y]
        # Number of classes for this problem type
        if is_dist:
            num_sub_classes = len(these_labels[0])
        else:
            num_sub_classes = len(set(these_labels))
        # Get the limiting sub-class for this problem type
        if is_dist:
            # Count the distribution as belonging to the class associated with the
            # most common label
            these_labels_int = [np.argmax(el) for el in these_labels]
        else:
            these_labels_int = these_labels
        u, c = np.unique(these_labels_int, return_counts=True)
        print("Problem type: {} U: {} C: {}".format(problem_type, u, c))
        if len(c) - 1 != np.max(these_labels_int):
            raise ValueError("You need at least two examples for each sub-class -- {} -- {}.".format(u, c))
        limiting_factor = np.min(c)
        limiting_subclass = np.argmin(c)
        if verbose:
            print("Limiting number for problem type {} is {} examples, subclass {}.".format(
                problem_type, limiting_factor, limiting_subclass))
        # Group (example, label, metadata) tuples by sub-class label
        if is_dist:
            examples_by_label = [[(x, _y, md) for x, _y, md in zip(all_x, these_labels, all_metadata) if np.argmax(_y) == y] for y in range(num_sub_classes)]
        else:
            examples_by_label = [[(x, _y, md) for x, _y, md in zip(all_x, these_labels, all_metadata) if _y == y] for y in range(num_sub_classes)]
        # Number of examples of each sub-class to use in the training set
        n_to_pull = int(limiting_factor * train_proportion)
        example_indices_by_class = {}
        # Get indices of train and val examples to use, for each sub-class
        example_indices_by_class["train"] = [np.random.choice(range(len(this_class)),
            size=n_to_pull, replace=False) for this_class in examples_by_label]
        example_indices_by_class["val"] = [get_difference(range(len(this_class)), train_examples_this_class) for
            train_examples_this_class, this_class in zip(example_indices_by_class["train"], examples_by_label)]
        # Fill in the examples & labels, given the indices for each sub-class
        for k in example_indices_by_class:
            for i, example_indices in enumerate(example_indices_by_class[k]):
                for j in example_indices:
                    X[k][problem_type].append(examples_by_label[i][j][0])
                    Y[k][problem_type].append(examples_by_label[i][j][1])
                    metadata[k][problem_type].append(examples_by_label[i][j][2])
    return X["train"], Y["train"], X["val"], Y["val"], metadata['train'], metadata['val']

def service_tls_hostnames(hostname):
    if "googlevideo.com" in hostname:
        return "youtube"
    elif "nflxvideo.net" in hostname:
        return "netflix"
    elif "ttvnw.net" in hostname:
        return "twitch"
    else:
        raise KeyError(hostname)

def clear_path(path):
    """
    Clears the entire directory represented by path, if it exists.
    """
    if not os.path.exists(path):
        return

    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(e)

def make_path(path):
    """
    Makes all directories represented by path.
    """
    if not os.path.exists(path): ...
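A usage sketch for get_even_train_split above, on toy inputs (the shapes, label values, and train_proportion here are illustrative assumptions, not from the source):

import numpy as np

# 20 examples; two problem types, with integer labels 0..1 and 0..2 respectively
all_x = [np.random.rand(8) for _ in range(20)]
all_y = [[i % 2, i % 3] for i in range(20)]
all_metadata = [{"id": i} for i in range(20)]

x_tr, y_tr, x_val, y_val, md_tr, md_val = get_even_train_split(
    all_x, all_y, all_metadata, train_proportion=0.75)
# Problem type 0 has 10 examples per class, so the train split holds
# int(10 * 0.75) = 7 examples per class and the rest land in validation.
print(len(x_tr[0]), len(x_val[0]))  # 14 6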


table_structure_analysis.py

Source: table_structure_analysis.py (GitHub)


# !/usr/bin/env python
# -*- coding: utf-8 -*-
######################################################################
#
# (c) Copyright University of Southampton, 2020
#
# Copyright in this software belongs to University of Southampton,
# Highfield, University Road, Southampton SO17 1BJ
#
# Created By : Juliusz Ziomek
# Created Date : 2020/09/09
# Project : GloSAT
#
######################################################################
import numpy as np
import warnings
from sklearn.cluster import DBSCAN

area = lambda box: (box[2] - box[0]) * (box[3] - box[1]) if box[2] >= box[0] and box[3] >= box[1] else 0

def run_dbs_1D(cells: list, eps: int, include_outliers=True, min_samples=2) -> list:
    '''
    Runs DBSCAN in 1D and returns the average value for each label.
    If outliers are detected (label = -1), each of them is appended to the average values.
    '''
    centers = np.array([cells]).reshape(-1, 1)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(centers)
    examples_by_label = {label: [] for label in labels}
    mean_by_label = dict()
    for no, center in enumerate(centers):
        examples_by_label[labels[no]].append(center)
    for label in examples_by_label:
        if label != -1:
            mean_by_label[label] = sum(examples_by_label[label]) / len(examples_by_label[label])
    return list(mean_by_label.values()) + (examples_by_label[-1] if -1 in examples_by_label.keys() and include_outliers else [])

def reconstruct_table(cells: list, table: list, eps: int) -> (list, list):
    '''
    Reconstructs the cells, given the table region, using DBSCAN with hyperparameter eps.
    '''
    table_width = table[2] - table[0]
    table_height = table[3] - table[1]
    # Normalise cells to the table region
    cells = [[(cell[0] - table[0]) / table_width, (cell[1] - table[1]) / table_height,
              (cell[2] - table[0]) / table_width, (cell[3] - table[1]) / table_height] for cell in cells]
    cells_x = [0, 1]
    cells_y = [0, 1]
    for cell in cells:
        cells_x += [cell[0], cell[2]]
        cells_y += [cell[1], cell[3]]
    eps_x, eps_y = check_hyperparams(cells, eps)
    rows = run_dbs_1D(cells_y, eps_y)
    cols = run_dbs_1D(cells_x, eps_x)
    rows = [int(row * table_height) + table[1] for row in rows]
    cols = [int(col * table_width) + table[0] for col in cols]
    return rows, cols

def reconstruct_table_coarse_and_fine(coarse_cells: list, fine_cells: list, table: list, eps: int) -> (list, list):
    '''
    Reconstructs the cells, given the table region, using DBSCAN with hyperparameter eps.
    '''
    table_width = table[2] - table[0]
    table_height = table[3] - table[1]
    rows = []
    cols = []
    if fine_cells != []:
        # Normalise cells to the table region
        fine_cells = [[(cell[0] - table[0]) / table_width, (cell[1] - table[1]) / table_height,
                       (cell[2] - table[0]) / table_width, (cell[3] - table[1]) / table_height] for cell in fine_cells]
        cells_x = [0, 1]
        cells_y = [0, 1]
        for cell in fine_cells:
            cells_x += [cell[0], cell[2]]
            cells_y += [cell[1], cell[3]]
        fine_eps_x, fine_eps_y = check_hyperparams(fine_cells, eps)
        rows += run_dbs_1D(cells_y, fine_eps_y)
        cols += run_dbs_1D(cells_x, fine_eps_x)
    if coarse_cells != []:
        coarse_cells = [[(cell[0] - table[0]) / table_width, (cell[1] - table[1]) / table_height,
                         (cell[2] - table[0]) / table_width, (cell[3] - table[1]) / table_height] for cell in coarse_cells]
        cells_x = [0, 1]
        cells_y = [0, 1]
        for cell in coarse_cells:
            cells_x += [cell[0], cell[2]]
            cells_y += [cell[1], cell[3]]
        eps_x, eps_y = check_hyperparams(coarse_cells, eps)
        rows += run_dbs_1D(cells_y, eps_y)
        cols += run_dbs_1D(cells_x, eps_x)
    # Merge coarse and fine candidates by clustering once more
    if fine_cells != []:
        rows = run_dbs_1D(rows, fine_eps_y)
        cols = run_dbs_1D(cols, fine_eps_x)
    elif coarse_cells != []:
        rows = run_dbs_1D(rows, eps_y)
        cols = run_dbs_1D(cols, eps_x)
    rows = [int(row * table_height) + table[1] for row in rows]
    cols = [int(col * table_width) + table[0] for col in cols]
    return rows, cols

def check_hyperparams(cells: list, eps: int) -> (int, int):
    '''
    Checks whether the eps parameter is smaller than half the average width and height of a cell.
    If one of those conditions is violated, issues a warning.
    Returns adjusted hyperparameters for x and y.
    '''
    diff_x, diff_y = [], []
    for cell in cells:
        diff_x.append(cell[2] - cell[0])
        diff_y.append(cell[3] - cell[1])
    avg_diff_x = sum(diff_x) / len(diff_x)
    avg_diff_y = sum(diff_y) / len(diff_y)

    if avg_diff_x / 2 < eps:
        warnings.warn("Hyperparameter eps = {} larger than half of average cell size in x. Changing to {}".format(eps, avg_diff_x / 2), RuntimeWarning)
        eps_x = avg_diff_x / 2
    else:
        eps_x = eps
    if avg_diff_y / 2 < eps:
        warnings.warn("Hyperparameter eps = {} larger than half of average cell size in y. Changing to {}".format(eps, avg_diff_y / 2), RuntimeWarning)
        eps_y = avg_diff_y / 2
    else:
        eps_y = eps
    return eps_x, eps_y

def how_much_contained(box1: list, box2: list) -> int:
    '''
    Checks how much of the first box lies inside the second one.
    '''
    area1 = area(box1)
    intersection_box = [max(box1[0], box2[0]),
                        max(box1[1], box2[1]),
                        min(box1[2], box2[2]),
                        min(box1[3], box2[3])]
    intersection_area = area(intersection_box)...
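For intuition, an illustrative call to run_dbs_1D above (the coordinates are invented): 1-D positions that sit within eps of a neighbour collapse into one DBSCAN cluster, and the function returns each cluster's mean, so near-duplicate cell edges become a single row or column line.

ys = [0.10, 0.11, 0.12, 0.50, 0.52, 0.90, 0.91]
print(run_dbs_1D(ys, eps=0.05))  # three cluster means near 0.11, 0.51 and 0.905 (each a length-1 array)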


compute_centroids_plus.py

Source: compute_centroids_plus.py (GitHub)


import numpy as np
import glob
from matplotlib import pyplot as plt
import os
import sys
from collections import defaultdict
from util import *
from sklearn.cluster import KMeans

def compute_centroids():
    """
    Computes five KMeans centroids per category from the training split.
    """
    centroids = {}
    cnts = defaultdict(int)
    idx_to_category, _ = get_category_mappings()
    K = len(idx_to_category)
    train_examples = np.load("data/split/train_examples.npy")
    train_labels = np.load("data/split/train_labels.npy")
    # Group training examples by label: one array of rows per category index
    examples_by_label = [train_examples[np.where(train_labels[:] == j)] for j in range(K)]
    for idx, category in enumerate(idx_to_category):
        if idx % 10 == 0:
            print("Done with category", idx)
        kmeans = KMeans(n_clusters=5, random_state=0).fit(examples_by_label[idx])
        clusters = kmeans.cluster_centers_
        category = idx_to_category[idx]
        for a in range(5):
            name = category + "_" + str(a)
            centroids[name] = clusters[a]

    return centroids

def create_centroids_dir():
    """
    Create the centroids directories to save results.
    """
    try:
        os.makedirs("centroids_plus_normalized/")
        os.makedirs("centroids_plus_normalized/npy")
        os.makedirs("centroids_plus_normalized/png")
    except OSError:
        pass  # already exists

def save_centroids(centroids):
    """
    Save an image of each centroid to centroids_plus_normalized/png
    and its numpy array to centroids_plus_normalized/npy.
    """
    for category, centroid in centroids.items():
        plt.imshow(np.reshape(centroid, (28, 28)), cmap='gray')
        plt.title(category)
        plt.savefig("centroids_plus_normalized/png/" + category)
        np.save("centroids_plus_normalized/npy/" + category, centroid)
        # plt.show()

if __name__ == "__main__":
    if not os.path.isdir("data/split"):
        sys.exit("Need data directory.")
    centroids = compute_centroids()
    create_centroids_dir()
    save_centroids(centroids)
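The grouping idiom at the heart of compute_centroids, shown standalone with toy data (the shapes and class count are assumptions): boolean indexing via np.where splits the training matrix into one array of rows per label.

import numpy as np

train_examples = np.random.rand(100, 784)          # e.g. flattened 28x28 images
train_labels = np.random.randint(0, 10, size=100)  # 10 hypothetical categories
K = 10
examples_by_label = [train_examples[np.where(train_labels == j)] for j in range(K)]
print([arr.shape[0] for arr in examples_by_label])  # per-label example counts, summing to 100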

