Best Python code snippet using autotest_python
decision_tree_classifier.py
Source: decision_tree_classifier.py
import collections
import math

"""This program creates a decision tree for data based on the attributes of the data.
It currently works with categorical and quantitative attributes. It prints a preorder
traversal of the decision tree."""

training_data = [
    ['Sunny', 'Hot', 'High', 'False', 'No'],
    ['Sunny', 'Hot', 'High', 'True', 'No'],
    ['Overcast', 'Hot', 'High', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'True', 'No'],
    ['Overcast', 'Cool', 'Normal', 'True', 'Yes'],
    ['Sunny', 'Mild', 'High', 'False', 'No'],
    ['Sunny', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'Normal', 'False', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'True', 'Yes'],
    ['Overcast', 'Mild', 'High', 'True', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'True', 'No']
]

# Entropy of the full dataset (base-10 logs; any fixed base gives the same split ordering).
orig_entropy = 0
outcomes = collections.Counter(j[-1] for j in training_data)
den = sum(outcomes.values())
for j in outcomes.values():
    orig_entropy -= (j / den) * math.log((j / den), 10)

def recursive(dataset, used_attributes):
    # Termination conditions.
    if not dataset:
        print('stop1')
        return None
    if len(collections.Counter(j[-1] for j in dataset)) == 1:
        print('stop3', 'classify as', dataset[0][-1])
        return None
    if len(used_attributes) == (len(dataset[0]) - 1):
        print('stop2', 'likely need more attributes')
        return None
    max_info_gain = 0
    for i in range(len(dataset[0]) - 1):  # check each attribute; -1 to exclude the class label
        if i not in used_attributes:
            if type(dataset[0][i]) is str:  # categorical attribute: one partition per value
                cur_attr_values = collections.Counter([j[i] for j in dataset])
                partitions = {}
                for k in cur_attr_values.keys():
                    partitions[k] = []
                for m in dataset:
                    partitions[m[i]].append(m)
                weighted_entropy = 0
                for n in partitions:
                    outcomes = collections.Counter(j[-1] for j in partitions[n])
                    den = sum(outcomes.values())
                    entropy = 0
                    for j in outcomes.values():
                        entropy -= (j / den) * math.log((j / den), 10)
                    weighted_entropy += entropy * len(partitions[n]) / len(dataset)
                cur_info_gain = orig_entropy - weighted_entropy
                if cur_info_gain >= max_info_gain:
                    max_info_gain = cur_info_gain
                    use_partition = partitions
                    attribute = i
            else:  # quantitative attribute: try binary splits at candidate thresholds
                sorted_values = []
                for p in dataset:
                    if p[i] not in sorted_values:
                        sorted_values.append(p[i])
                sorted_values.sort()
                split_values = [(sorted_values[0] / 2)]
                for t in range(len(sorted_values) - 1):
                    split_values.append((sorted_values[t] + sorted_values[t + 1]) / 2)
                partitions = {}
                for k in split_values:
                    partitions[k] = [[], []]
                for m in dataset:
                    for l in split_values:
                        if m[i] < l:
                            partitions[l][0].append(m)
                        else:
                            partitions[l][1].append(m)
                for h in partitions:
                    weighted_entropy = 0
                    for r in range(2):
                        outcomes = collections.Counter(j[-1] for j in partitions[h][r])
                        den = sum(outcomes.values())
                        entropy = 0
                        for j in outcomes.values():
                            entropy -= (j / den) * math.log((j / den), 10)
                        weighted_entropy += entropy * len(partitions[h][r]) / len(dataset)
                    cur_info_gain = orig_entropy - weighted_entropy
                    if cur_info_gain >= max_info_gain:
                        max_info_gain = cur_info_gain
                        use_partition = {h: partitions[h]}
                        attribute = i
    print('new node for attribute', attribute)
    if type(dataset[0][attribute]) is str:
        for k in use_partition.values():
            print('under', k[0][attribute], 'for attribute', attribute)
            recursive(k, (used_attributes + [attribute]))
    else:
        for k in use_partition.values():
            for b in k:
                if b == k[0]:
                    temp = 'less than'
                else:
                    temp = 'greater than'
                print('under', temp, list(use_partition.keys())[0])
                recursive(b, (used_attributes + [attribute]))
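For reference, a minimal usage sketch (an assumption, since the snippet above only defines the data and the function): calling recursive on the full training set with no attributes marked as used prints the tree in preorder. With 9 'Yes' and 5 'No' outcomes, the dataset entropy is -(9/14)*log10(9/14) - (5/14)*log10(5/14) ≈ 0.283, and the root split is the attribute whose weighted child entropy yields the largest gain against that value.

# Hypothetical driver, assuming the definitions above are in the same module.
recursive(training_data, [])   # prints the decision tree in preorder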
regression_tree.py
Source: regression_tree.py
1import collections2"""This program creates a regression tree for data pased on the attributes of the data. 3It currently works with categorical and quantitative attributes. It returns a preorder list of the regression tree."""4training_data = [5    ['Sunny', 'Hot', 'High', 'False', 25],6    ['Sunny', 'Hot', 'High', 'True', 30],7    ['Overcast', 'Hot', 'High', 'False', 46],8    ['Rainy', 'Mild', 'High', 'False', 45],9    ['Rainy', 'Cool', 'Normal', 'False', 52],10    ['Rainy', 'Cool', 'Normal', 'True', 23],11    ['Overcast', 'Cool', 'Normal', 'True', 43],12    ['Sunny', 'Mild', 'High', 'False', 35],13    ['Sunny', 'Cool', 'Normal', 'False', 38],14    ['Rainy', 'Mild', 'Normal', 'False', 46],15    ['Sunny', 'Mild', 'Normal', 'True', 48],16    ['Overcast', 'Mild', 'High', 'True', 52],17    ['Overcast', 'Hot', 'Normal', 'False', 44],18    ['Rainy', 'Mild', 'High', 'True', 30]19]20tolerance = 621def std_dev(dataset, population):22    if len(dataset) == 1:23        return 024    values = []25    for i in dataset:26        values.append(i[-1])27    mean = sum(values) / len(values)28    squared_deviations = []29    for j in values:30        squared_deviations.append((j - mean) ** 2)31    if population:32        den = len(dataset)33    else:34        den = len(dataset) - 135    variance = sum(squared_deviations) / den36    std_dev = variance ** 0.537    return std_dev38orig_std_dev = std_dev(training_data, True)39def recursive(dataset, used_attributes):40    if not dataset:41        print('stop1')42        return None43    if len(dataset) == 1:44        print('stop3', 'classify as', dataset[0][-1])45        return None46    if std_dev(dataset, False) <= tolerance:47        print('stop4 classify as', (sum(j[-1] for j in dataset) / len(dataset)))48        return None49    if len(used_attributes) == (len(dataset[0]) - 1):                 # these are the termination conditions50        print('stop2', 'likely need more attibutes')51        return None52    max_std_red = 053    for i in range(len(dataset[0]) - 1): # checks each attribute of a data vector, -1 to not include the respondent54        if i not in used_attributes:55            if type(dataset[0][i]) is str:56                cur_attr_values = collections.Counter([j[i] for j in dataset])57                partitions = {}58                for k in cur_attr_values.keys():59                    partitions[k] = []60                for m in dataset:61                    partitions[m[i]].append(m)62                weighted_std_dev = 063                for k in partitions:64                    cur_std_dev = std_dev(partitions[k], False)65                    weighted_std_dev += cur_std_dev * len(partitions[k]) / len(dataset)66                cur_std_red = orig_std_dev - weighted_std_dev67                if cur_std_red >= max_std_red:68                    max_std_red = cur_std_red69                    use_partition = partitions70                    attribute = i71            else:      #this attribute is non-categorical data, i.e., real number72                sorted_values = []73                for p in dataset:74                    if p[i] not in sorted_values:75                        sorted_values.append(p[i])76                sorted_values.sort()77                split_values = [(sorted_values[0] / 2)]78                for t in range(len(sorted_values) - 1):79                    split_values.append((sorted_values[t] + sorted_values[t + 1]) / 2)80                partitions = {}81                for k in split_values:82                    partitions[k] 
= [[],[]]83                for m in dataset:84                    for l in split_values:85                        if m[i] < l:86                            partitions[l][0].append(m)87                        else:88                            partitions[l][1].append(m)89                for h in partitions:90                    cur_std_dev = std_dev(h,False)91                    cur_std_red = orig_std_dev - cur_std_dev92                    if cur_std_red >= max_std_red:93                        max_std_red = cur_std_red94                        use_partition = {h : partitions[h]}95                        attribute = i96    print('new node for attribute', attribute)97    if type(dataset[0][attribute]) is str:98        for k in use_partition.values():    99            print('under', k[0][attribute], 'for attribute',attribute)100            recursive(k, (used_attributes + [attribute]))101    102    else:103        for k in use_partition.values():104            for b in k:105                if b == k[0]:106                    temp = 'less than'107                else:108                    temp = 'greater than'109                print('under', temp, use_partition.keys())110                recursive(b, (used_attributes + [attribute]))...text_lengths_collector.py
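The same kind of driver applies to the regression tree (again an assumption, since the snippet only defines the data and the function). Here splits are chosen by standard-deviation reduction rather than information gain, and a branch stops and predicts the mean of its target values once their sample standard deviation falls to the tolerance of 6 or below.

# Hypothetical driver, assuming the definitions above are in the same module.
recursive(training_data, [])   # prints the regression tree in preorder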
text_lengths_collector.py
Source: text_lengths_collector.py
from __future__ import annotations

import json
import logging
import statistics

from pytorch_ie.annotations import Span
from transformers import AutoTokenizer

from pie_utils.statistics import WithStatistics

from ..types import DocumentWithPartitions

logger = logging.getLogger(__name__)


class TextLengthsCollector(WithStatistics):
    """This document processor collects text lengths, measured in number of tokens, and can
    show them as a json dict and, if plotext is installed, as a histogram. It is purely
    statistical and does not modify the documents.

    Presented values:
     * min, max, mean, and stddev of the collected text lengths,
     * num_docs (number of processed documents), and
     * if use_partition is enabled, num_parts (number of processed parts)

    :param tokenizer_name_or_path: the identifier of the Huggingface tokenizer that will be used
    :param use_partition: a boolean flag to enable considering a partition, i.e. tokenize and
        collect the lengths for the partition entries (e.g. sentences or sections) individually
    :param tokenizer_kwargs: a dictionary of further keyword arguments passed when calling
        the tokenizer
    :param plotext_kwargs: a dictionary of further keyword arguments passed when calling
        plotext.hist()
    """

    def __init__(
        self,
        tokenizer_name_or_path: str,
        use_partition: bool | None = False,
        tokenizer_kwargs: dict | None = None,
        plotext_kwargs: dict | None = None,
    ):
        self.use_partition = use_partition
        self.tokenizer_name_or_path = tokenizer_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name_or_path)
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.plotext_kwargs = plotext_kwargs or {}
        self.reset_statistics()

    def reset_statistics(self):
        self.text_lengths = []
        self.num_docs = 0
        self.num_parts = 0

    def show_statistics(self, description: str | None = None):
        description = description or "Statistics for text lengths"
        caption = f"{description} (tokenizer_name_or_path={self.tokenizer_name_or_path})"
        try:
            import plotext as plt

            plt.clf()
            plt.hist(data=self.text_lengths, **self.plotext_kwargs)
            plt.title(caption)
            plt.show()
        # excluded from test coverage since this would require uninstalling plotext and
        # only simple logging is performed here
        except ModuleNotFoundError:  # pragma: no cover
            logger.info("install plotext to display the data as a histogram at the console")
        stats = {
            "min": min(self.text_lengths),
            "max": max(self.text_lengths),
            "mean": statistics.mean(self.text_lengths),
            "stddev": statistics.pstdev(self.text_lengths),
            "num_docs": self.num_docs,
        }
        if self.use_partition:
            stats["num_parts"] = self.num_parts
        logger.info(f"{caption}:\n{json.dumps(stats, indent=2)}")

    def __call__(self, document: DocumentWithPartitions) -> DocumentWithPartitions:
        # Tokenize either the whole text or each partition entry individually.
        partition = (
            document.partitions if self.use_partition else [Span(start=0, end=len(document.text))]
        )
        tokenized = self.tokenizer(
            [document.text[part.start : part.end] for part in partition], **self.tokenizer_kwargs
        )
        new_lengths = [len(encoding) for encoding in tokenized.encodings]
        self.text_lengths.extend(new_lengths)
        self.num_parts += len(partition)
        self.num_docs += 1
        return document
