Best Python code snippet using green
genotype_filters.py
Source:genotype_filters.py  
"""
Filters on the result of boolean operations on multiple samples genotypes,
such as "all have the same genotype" or "all are homozygous".
"""
from django.conf import settings
from varapp.filters.apply_bitwise import c_apply_bitwise  # from cython extension
from varapp.constants.filters import FILTER_CLASS_GENOTYPE
from varapp.constants.genotype import *
from varapp.data_models.samples import SamplesSelection
from varapp.data_models.variants import *
from varapp.filters.filters import Filter, FilterResult, FiltersCollection
from varapp.variants.genotypes_service import genotypes_service
from varapp.variants.variants_factory import set_source
import abc
import itertools
import multiprocessing as mp
import numpy as np
from functools import reduce
from operator import attrgetter, itemgetter, __and__
from time import time

AND = 'AND'
OR = 'OR'
DEBUG = True and settings.DEBUG


def merge_conditions_array(conds):
    """If there are multiple affected samples sharing the same parents,
    the conditions can be redundant. Simplify the conditions array so that
    there is at most one for each genotype/sample. If there are several constraints
    for the same genotype, check that they are compatible and take the strongest
    (lowest bit value).

    The input list is left untouched (a sorted copy is grouped).

    :param conds: an array of couples [sample_index, genotype_bit]
    :rtype: list of tuples (sample_index, genotype_bit), ordered by sample index
    """
    merged = []
    if not conds:
        return merged
    # Group by sample index, and get a single common bit for all conds on that sample.
    # `sorted` (rather than `conds.sort`) avoids reordering the caller's list.
    by_sample = sorted(conds, key=itemgetter(0))
    for idx, group in itertools.groupby(by_sample, itemgetter(0)):
        genbits = [x[1] for x in group]   # only the genotype bit
        common_bits = reduce(__and__, genbits)
        merged.append((idx, common_bits))
    return merged


class GenotypesFilter(Filter):
    """Defines a way to *apply* a filter on variants genotypes."""
    # NOTE(review): Python 3 ignores a `__metaclass__` class attribute, so this
    # line does not make the class abstract. Kept as-is to avoid changing behavior.
    __metaclass__ = abc.ABCMeta
    filter_class = FILTER_CLASS_GENOTYPE
    need_groups = []  # The required group names in the samples selection for the filter to work.
    need_parents = 0  # Whether 0/1/2 parents are required for the filter to work

    def __init__(self, ss:SamplesSelection, val, name='genotype', op='=', db=None):
        """Build the filter for the samples selection *ss*.

        Sets `self.shortcut` to True whenever the selection cannot satisfy the
        filter (no active sample, missing parents, missing groups, or no
        condition at all); a shortcut filter returns an empty result.

        :param ss: the samples selection the filter applies to.
        :param val: the filter value (one of the GENOTYPE_* constants).
        :param name: filter name, passed to the parent constructor.
        :param op: filter operator, passed to the parent constructor.
        :param db: database name, passed to the parent constructor.
        """
        super().__init__(name=name, op=op, val=val, ss=ss, db=db)
        self.nsamples = len(ss.active_idx)
        self.merge_op = AND
        self.shortcut = False  # Flag: if True, don't filter anything
        # Need at least one active sample
        if len(self.ss.active_idx) == 0:
            self.shortcut = True
        # If parents are required, check that both are present for at least one of the affected samples
        mothers_aff = [ss.mother_idx_of(s) for s in ss.affected]
        fathers_aff = [ss.father_idx_of(s) for s in ss.affected]
        if self.need_parents == 2 and all(None in x for x in zip(mothers_aff, fathers_aff)):
            self.shortcut = True
        elif self.need_parents == 1 and all((x,y)==(None,None) for x,y in zip(mothers_aff, fathers_aff)):
            self.shortcut = True
        # If certain groups are required, check that they are present in the selection
        if any((x not in ss.groups.keys() or len(ss.groups[x]) == 0) for x in self.need_groups):
            self.shortcut = True
        # The compound case implements its own stuff, but otherwise do that:
        if self.val != GENOTYPE_COMPOUND:
            conditions_array = self.build_conditions_array()
            self.conditions_array = merge_conditions_array(conditions_array)
            if len(self.conditions_array) == 0:
                self.shortcut = True
            self.conditions_vector = self.build_conditions_vector(self.conditions_array)

    def build_conditions_array(self):
        """Construct a list of lists [sample_idx, BITCODE], one for each sample.
        Then a variant passes if in its decoded gts, there is BITCODE at position idx.
        Once only: it is proper to the filter (with the list of all possible samples,
        but no samples selection).

        :raises NotImplementedError: always; subclasses must override it.
        """
        raise NotImplementedError("No `build_conditions_array` method implemented.")

    def build_conditions_vector(self, conditions_array):
        """From a *conditions_array*, of elements [sample_idx, BITCODE],
        build a vector of size len(active_samples) with BITCODE at indices
        where a condition is given, and GENOTYPE_BIT_ANY elsewhere.

        :rtype: np.ndarray[uint8]
        """
        active_idx = self.ss.active_idx
        conds = GENOTYPE_BIT_ANY * np.ones(len(active_idx), dtype=np.uint8)
        # Map a sample index to its position among the active samples
        shift = {idx:i for i,idx in enumerate(active_idx)}
        for idx,bit in conditions_array:
            conds[shift[idx]] = bit
        return conds

    def scan_genotypes(self, genotypes, sub_ids=None, db=None):
        """Pass through all genotypes and return only the indices of those that pass the filter.

        :param genotypes: np.ndarray[uint64, dim=2]
        :param sub_ids: if given, the variant ids to consider instead of 1..N.
        :param db: database name; only used to fetch chrX ids in the x-linked case.
        :rtype: np.ndarray[uint64]"""
        if self.shortcut:
            # Same dtype as the regular result, so callers see a consistent type
            return np.zeros(0, dtype=np.uint64)
        N = len(genotypes)
        if sub_ids is not None:
            variant_ids = sub_ids
        elif self.val == 'x_linked' and db:
            variant_ids = genotypes_service(db).chrX
        else:
            variant_ids = np.arange(1, N+1, dtype=np.uint64)
        active_idx = np.asarray(self.ss.active_idx, dtype=np.uint16)
        conditions = self.conditions_vector
        is_and = self.merge_op == AND
        if len(conditions) == 0:
            passing = variant_ids
        else:
            passing = self.parallel_apply_bitwise(genotypes, variant_ids, conditions, active_idx, is_and)
        return passing

    @staticmethod
    def parallel_apply_bitwise(genotypes, variant_ids, conditions, active_idx, is_and):
        """Run c_apply_bitwise in parallel. Takes the same arguments.

        One job per CPU is submitted asynchronously and the results are
        concatenated in batch order.
        """
        N = len(genotypes)
        nprocs = mp.cpu_count()
        B = round(N/nprocs + 0.5)  # batch size
        # Split variant_ids in batches (genotype batches are equally-sized, but not
        #   variant ids, in case a subset was given)
        split_at = variant_ids.searchsorted([(k+1)*B+1 for k in range(nprocs-1)])
        variant_ids_batches = np.split(variant_ids, split_at)
        assert len(variant_ids_batches) == nprocs
        pool = mp.Pool(processes=nprocs)
        try:
            # `apply_async` (not the blocking `apply`) so that the batches
            # actually run concurrently; `get()` keeps them in batch order.
            jobs = [pool.apply_async(c_apply_bitwise,
                args=(genotypes[k*B:(k+1)*B,:],
                       variant_ids_batches[k],
                       conditions, active_idx, is_and, B))
                for k in range(nprocs)]
            passing = [job.get() for job in jobs]
        finally:
            # Always release the worker processes, even if a job failed
            pool.close()
            pool.join()
        return np.concatenate(passing)

    #@timer
    def apply(self, variants=None, genotypes=None, db=None, limit=None, offset=0):
        """Apply this collection of filters on a collection of variants.

        :param variants: a VariantsCollection or a QuerySet of variants.
            If None, makes a QuerySet of the whole *db*.
        :param db: database name. If no set, it tries to be inferred from *variants*.
        :param genotypes: a list of genotypes arrays.
            if None, a GenotypesService is created from the variants' db.
            In principle, set it for testing purposes only.
        :param limit: if not None, only that many passing variants are returned.
        :param offset: index of the first passing variant returned (used with *limit*).
        :rtype: FilterResult
        """
        sub_ids = None
        if variants is None and db is not None:
            variants = Variant.objects.using(db)
        elif db is None:
            db = variants.db
        if self.shortcut:
            return FilterResult(variants=VariantsCollection([]), ids=[], n_filtered=0)
        if genotypes is None:
            assert db is not None, "Either a db name or a genotypes array is required"
            genotypes = genotypes_service(db).genotypes
        else:
            assert len(genotypes) == len(variants)
        if self.val == 'x_linked':
            # Restrict the scan to variants on chromosome X
            if isinstance(variants, VariantsCollection):
                sub_ids = np.asarray([v.variant_id for v in variants if v.chrom=='chrX'], dtype=np.uint64)
            else:
                sub_ids = genotypes_service(db).chrX
        passing = self.scan_genotypes(genotypes, sub_ids=sub_ids, db=db)
        return FilterResult(
            variants=self.variants_from_mask(variants, passing, db, limit, offset),
            ids=passing,
            n_filtered=len(passing),
        )

    @staticmethod
    def variants_from_mask(variants, passing, db=None, limit=None, offset=0):
        """Get the collection of variants which id is in *passing*.

        *offset* is only taken into account when *limit* is given.
        """
        if limit is not None:
            passing = passing[offset:offset+limit]
        passing = set(passing)
        return VariantsCollection([v for v in variants if v.variant_id in passing], db=db)

    def __str__(self):
        return "<Filter {}>".format(self.short_str()) + ('-'+str(self.ss) if self.ss else '')

    # Same textual representation for `repr()` and `str()`
    __repr__ = __str__


class GenotypesFilterDoNothing(GenotypesFilter):
    """A filter that every variant passes anyway."""
    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, 'nothing', db=db)

    def build_conditions_array(self):
        """One GENOTYPE_BIT_ANY condition per active sample."""
        assert self
        return [[i, GENOTYPE_BIT_ANY] for i in self.ss.active_idx]


class GenotypesFilterActive(GenotypesFilter):
    """Return a variant only if it is mutant in at least one of the active samples.
    """
    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, GENOTYPE_ACTIVE, db=db)
        # Conditions are merged with OR: one carrier sample is enough
        self.merge_op = OR

    def build_conditions_array(self):
        """One GENOTYPE_BIT_CARRIER condition per active sample."""
        return [[i, GENOTYPE_BIT_CARRIER] for i in self.ss.active_idx]


class GenotypesFilterDominant(GenotypesFilter):
    """Simplest scenario: autosomal dominant.
    Suppose the effect is dominant, i.e. one allele
    mutated is enough to observe a phenotype.
    Filter variants that are mutated in all samples but the controls.
    """
    need_groups = ["affected"]

    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, GENOTYPE_DOMINANT, db=db)

    def build_conditions_array(self):
        """Affected samples are carriers, not affected samples are non-carriers."""
        return [[i, GENOTYPE_BIT_CARRIER] for i in self.ss.affected_idx] + \
               [[i, GENOTYPE_BIT_NON_CARRIER] for i in self.ss.not_affected_idx]


class GenotypesFilterRecessive(GenotypesFilter):
    """Suppose the effect is recessive, i.e. a child must inherit a mutated
    allele from both carrier parents to have an observable phenotype.
    Filter mutations that are present in both the parents and homozygous
    in the "affected" children.
    Controls ("not_affected") are samples known to be non-carriers.
    """
    need_groups = ["affected"]

    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, GENOTYPE_RECESSIVE, db=db)

    def build_conditions_array(self):
        """Affected are homozygous carriers, their parents are carriers,
        and not affected samples are not homozygous carriers."""
        conds = []  # 1 per sample, because of its particular parents
        for s in self.ss.affected:
            idx = self.ss.idx_of(s.name, active=True)
            conds.append([idx, GENOTYPE_BIT_CARRIER_HOM])
            for i in self.ss.parents_idx_of(s):
                conds.append([i, GENOTYPE_BIT_CARRIER])
        for i in self.ss.not_affected_idx:
            conds.append([i, GENOTYPE_BIT_NOT_CARRIER_HOM])
        return conds


class GenotypesFilterDeNovo(GenotypesFilter):
    """Case where a mutation is present in a child but not in the parents.
    So the controls should be the parents, but can include other non-carriers.
    Otherwise it is the same as the Dominant case.
    """
    need_groups = ["affected"]
    need_parents = 2

    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, GENOTYPE_DENOVO, db=db)

    def build_conditions_array(self):
        """Affected children with two unaffected parents are heterozygous carriers,
        their parents and all not affected samples are non-carriers.
        Returns an empty list if no affected sample qualifies."""
        conds = []   # 1 per sample, because of its particular parents
        for s in self.ss.affected:
            idx = self.ss.idx_of(s.name, active=True)
            parents_idx = self.ss.parents_idx_of(s)
            if len(parents_idx) == 2:   # pointless if not both parents present
                if len(set(parents_idx) & set(self.ss.affected_idx)) > 0:
                    continue            # pointless if one of the parents is affected
                conds.append([idx, GENOTYPE_BIT_CARRIER_HET])
                for i in parents_idx:
                    conds.append([i, GENOTYPE_BIT_NON_CARRIER])
        if conds:
            for i in self.ss.not_affected_idx:
                conds.append([i, GENOTYPE_BIT_NON_CARRIER])
        return conds


class GenotypesFilterXLinked(GenotypesFilter):
    """A deleterious mutation is present on chromosome X. Possible cases:
    a) Dominant case: Apart from the proportion of affected children
       of each sex, it behaves exactly like a usual dominant mutation,
       so we don't cover that case here:
       - Affected <=> carrier;
       - In principle one of the parents should carry it, but it could be de novo.
    b) Recessive case:
       - Affected <=> woman carrier hom, or man carrier het;
       - For a woman, both parents must be carriers (and the father is affected);
       - For a man, only the mother must be carrier.
    """
    need_groups = ["affected"]
    need_parents = 0

    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, GENOTYPE_XLINKED, db=db)

    def build_conditions_array(self):
        """Build the recessive X-linked conditions, which depend on each sample's sex.
        Samples whose sex is neither 'M' nor 'F' get no condition of their own."""
        conds = []  # 1 per sample, because of its particular parents
        for s in self.ss.affected:
            idx = self.ss.idx_of(s.name, active=True)
            # Male: carrier het, and the mother is carrier
            if s.sex == 'M':
                conds.append([idx, GENOTYPE_BIT_CARRIER_HET])
                i = self.ss.mother_idx_of(s)
                if i is not None:
                    conds.append([i, GENOTYPE_BIT_CARRIER])
            # Female: carrier hom, and both parents are carriers
            elif s.sex == 'F':
                conds.append([idx, GENOTYPE_BIT_CARRIER_HOM])
                for i in self.ss.parents_idx_of(s):
                    conds.append([i, GENOTYPE_BIT_CARRIER])
        for s in self.ss.not_affected:
            idx = self.ss.idx_of(s.name, active=True)
            # Male unaffected cannot be carriers
            if s.sex == 'M':
                conds.append([idx, GENOTYPE_BIT_NON_CARRIER])
            # Female unaffected could be carrier het
            elif s.sex == 'F':
                conds.append([idx, GENOTYPE_BIT_NOT_CARRIER_HOM])
        return conds


class GenotypesFilterCompoundHeterozygous(GenotypesFilter):
    """Case where two mutations, inherited one from each parent,
    occur in the same gene and thus code for two defective proteins.
    Compose two results:
        - father is carrier in that gene and child has it;
        - mother is carrier in that same gene and child has it.
    Notes:
    - We cannot group conditions for many samples as we did before, because
      they can be touched by different compounds pairs in the same gene (rare ?).
    - Neither of the parents can be homozygous, or he would be affected (both proteins are touched).
    - A child cannot be homozygous at any position of the compounds pair, because
      that would suffice to invalidate both proteins and is indistinguishable from the
      recessive case.
    - Both parents could be affected at one position of the compounds pair (rare ?).
    """
    need_groups = ["affected"]
    need_parents = 2

    def __init__(self, ss:SamplesSelection, db=None):
        super().__init__(ss, val=GENOTYPE_COMPOUND, db=db)
        # The parent constructor skips the conditions for the compound case
        self.conditions_array = self.build_conditions_array()
        if not self.conditions_array:
            self.shortcut = True
        else:
            self.conditions_vector = self.build_compound_conditions_vector()

    def build_conditions_array(self):
        """Returns pairs of condition (paternal, maternal), one for each sample,
        in a dict {sample_name: [cond1, cond2]}.
        Make it also for non affected, because we want to find false positives searching
        as if they were affected. An unaffected sample could well carry one of the two variants.

        Samples whose pair of conditions duplicates that of an earlier sample are left out.
        """
        conds = {}
        seen = set()  # pairs of conditions already registered, to skip duplicates
        # Common condition: all affected are carriers het, and no unaffected can be homozygous
        base_cond = [(i, GENOTYPE_BIT_NOT_CARRIER_HOM) for i in self.ss.not_affected_idx] \
                +   [(i, GENOTYPE_BIT_CARRIER_HET) for i in self.ss.affected_idx]
        for s in self.ss.active:
            idx = self.ss.idx_of(s.name, active=True)
            father_idx = self.ss.father_idx_of(s)
            mother_idx = self.ss.mother_idx_of(s)
            if father_idx is None or mother_idx is None:
                continue
            if father_idx in self.ss.affected_idx or mother_idx in self.ss.affected_idx:
                continue            # pointless if one of the parents is affected
            # Father carrier
            c1 = base_cond + [
                (idx, GENOTYPE_BIT_CARRIER_HET),  # in case it is not affected, but we simulate for false positives
                (father_idx, GENOTYPE_BIT_CARRIER),
                (mother_idx, GENOTYPE_BIT_NON_CARRIER),
            ]
            # Mother carrier
            c2 = base_cond + [
                (idx, GENOTYPE_BIT_CARRIER_HET),
                (father_idx, GENOTYPE_BIT_NON_CARRIER),
                (mother_idx, GENOTYPE_BIT_CARRIER),
            ]
            # Note: c1 and c2 cannot both be true at the same genomic position
            pair = (tuple(merge_conditions_array(c1)), tuple(merge_conditions_array(c2)))
            # Remove duplicate conditions to speed it up: keep the first sample only
            if pair in seen:
                continue
            seen.add(pair)
            conds[s.name] = pair
        return conds

    def build_compound_conditions_vector(self):
        """Extend *self.build_conditions_vector()* to apply it to all sub-elements
        *c1*,*c2* of the more complicated {sample: [c1, c2]} of the compound case."""
        conditions = {}
        for sample, conds in self.conditions_array.items():
            conditions[sample] = [
                self.build_conditions_vector(conds[0]),
                self.build_conditions_vector(conds[1]),
            ]
        return conditions

    def apply(self, variants=None, genotypes=None, db=None, limit=None, offset=0, sub_ids=None, parallel=True):
        """Apply the compound filter and tag each returned variant with its source.

        :param variants: a collection of variants; if None, all variants of *db*.
        :param genotypes: a genotypes array; if None, it is fetched from *db*.
        :param db: database name. If not set, it is read from *variants*.
        :param limit: if not None, only that many passing variants are returned.
        :param offset: index of the first passing variant returned (used with *limit*).
        :param sub_ids: does nothing, just inheritance
        :param parallel: whether to scan the genes in several processes.
        :rtype: FilterResult
        """
        if self.shortcut:
            return FilterResult(variants=VariantsCollection([]), ids=[], n_filtered=0)
        if variants is None and db is not None:
            variants = Variant.objects.using(db)
        elif db is None:
            db = variants.db
        if db is None:
            # Collect the variant ids of each gene. Accumulating explicitly
            # (instead of `itertools.groupby`) keeps every variant of a gene
            # even when the variants are not sorted by gene.
            ids_by_gene = {}
            for v in variants:
                ids_by_gene.setdefault(v.gene_symbol, []).append(v.variant_id)
            batches = {gene: np.array(ids, dtype=np.uint64)
                for gene,ids in ids_by_gene.items()}
        else:
            gs = genotypes_service(db)
            batches = gs.variant_ids_batches_by_gene
        if genotypes is None:
            assert db is not None, "Either a db name or a genotypes array is required"
            genotypes = genotypes_service(db).genotypes
        else:
            assert len(genotypes) == len(variants)
        passing, sources, pairs = self.scan_genotypes_compound(genotypes, batches, parallel)
        variants = self.variants_from_mask(variants, passing, db, limit, offset)
        for v in variants:
            set_source(v, sources[v.variant_id])
        return FilterResult(
            variants=variants,
            ids=passing,
            n_filtered=len(passing),
        )

    def scan_genotypes_compound(self, genotypes, batches, parallel=True):
        """Scan the *genotypes* array for compounds. Variant ids are treated in batches,
           - one list of variant_ids per gene.

        :param batches: a dict {gene: variant_ids}.
        :returns: (sorted uint64 array of passing ids, {variant_id: source}, list of pairs)
        """
        if self.shortcut:
            # Same dtype as the regular result below
            passing, sources, pairs = np.zeros(0, dtype=np.uint64), {}, []
        else:
            N = len(genotypes)
            active_idx = np.asarray(self.ss.active_idx, dtype=np.uint16)
            batches = list(batches.items())
            if parallel:
                passing, sources, pairs = self.parallel_batches(genotypes, batches, active_idx, N)
            else:
                passing, sources, pairs = self.process_batches(genotypes, batches, active_idx, N)
            passing = np.array(list(passing), dtype=np.uint64)
            passing.sort()
        return passing, sources, pairs

    def parallel_batches(self, genotypes, batches, active_idx, N):
        """Parallelize the scanning of genotypes for compounds over groups of genes.

        :param batches: a list of couples (gene, variant_ids).
        :returns: the merged (passing, sources, pairs) of all groups.
        """
        passing = set()
        sources = {}
        pairs = []
        nprocs = mp.cpu_count()
        NB = len(batches)
        B = round(NB/nprocs + 0.5)  # batch size
        split_batches = [batches[k*B:(k+1)*B] for k in range(nprocs)]
        if DEBUG and 0:
            print("  @parallel_batches {} CPUs: {}".format(nprocs, [len(x) for x in split_batches]))
        pool = mp.Pool(processes=nprocs)
        try:
            res = [pool.apply_async(self.process_batches,
                args=(np.copy(genotypes), list(split_batches[k]), np.copy(active_idx), N))
                for k in range(nprocs)]
            output = [x.get() for x in res]
        finally:
            # Always release the worker processes, even if a job failed
            pool.close()
            pool.join()
        for x in output:
            passing |= x[0]
            sources.update(x[1])
            pairs += x[2]
        return passing, sources, pairs

    def process_batches(self, genotypes, batches, active_idx, N):
        """Search a batch of genes for compounds.

        :param batches: a list of couples (gene, variant_ids).
        :returns: the merged (passing, sources, pairs) of all genes.
        """
        passing = set()
        sources = {}
        pairs = []
        tbatch = 0  # cumulated processing time, for the debug message only
        for gene,variant_ids in batches:
            t1 = time()
            local_passing, local_sources, local_pairs = self.process_1_batch(variant_ids, genotypes, active_idx, N)
            t2 = time()
            tbatch += t2-t1
            passing |= local_passing
            pairs += local_pairs
            sources.update(local_sources)
        if DEBUG and 0:
            print("  Processed batches in {:.3f}s ({} passing)".format(tbatch,len(passing)))
        return passing, sources, pairs

    def process_1_batch(self, variant_ids, genotypes, active_idx, N):
        """Search 1 gene for compounds. Return:
        local_passing: set of variant_ids passing the filter
        local_sources: dict `{variant_id: 'paternal'/'maternal'}`
        local_pairs: list of compound pairs `(variant_id1, variant_id2)`
        """
        # Check that all affected samples have the compound
        local_passing_mother = set()
        local_passing_father = set()
        local_sources = {}
        for affected in self.ss.affected:
            if affected.name not in self.conditions_vector:
                continue
            conds = self.conditions_vector[affected.name]
            passing_father = set(c_apply_bitwise(genotypes, variant_ids, conds[0], active_idx, True, N))
            passing_mother = set(c_apply_bitwise(genotypes, variant_ids, conds[1], active_idx, True, N))
            # Exclude compounds that healthy samples carry as well
            if len(passing_father) > 0 and len(passing_mother) > 0:
                fp1 = set()
                fp2 = set()
                local_ids = np.array(list(passing_father | passing_mother), dtype=np.uint64)
                for healthy in self.ss.not_affected:
                    if healthy.name not in self.conditions_vector:
                        continue
                    healthy_conds = np.asarray(self.conditions_vector[healthy.name], dtype=np.uint8)
                    false_father = c_apply_bitwise(genotypes, local_ids, healthy_conds[0], active_idx, True, N)
                    false_mother = c_apply_bitwise(genotypes, local_ids, healthy_conds[1], active_idx, True, N)
                    for p1, p2 in itertools.product(false_father, false_mother):
                        if p1 in passing_father and p2 in passing_mother:
                            fp1.add(p1)
                            fp2.add(p2)
                passing_father = passing_father - fp1
                passing_mother = passing_mother - fp2
                # If there are any left in both lists, add them to the result set
                if len(passing_father) > 0 and len(passing_mother) > 0:
                    for k in passing_father:
                        local_sources[k] = 'paternal'
                    for k in passing_mother:
                        local_sources[k] = 'maternal'
                    # NOTE(review): an empty set is treated as "not initialized yet",
                    # so an intersection that became empty is replaced by the next
                    # sample's set instead of staying empty — confirm this is intended.
                    if len(local_passing_father) == 0:
                        local_passing_father = passing_father
                    else:
                        local_passing_father &= passing_father
                    if len(local_passing_mother) == 0:
                        local_passing_mother = passing_mother
                    else:
                        local_passing_mother &= passing_mother
            # All affected samples must have at least one of the combinations
            else:
                local_passing_father = set()
                local_passing_mother = set()
                local_sources = {}
                break  # go to next gene
        local_passing = local_passing_father | local_passing_mother
        local_pairs = list(itertools.product(
            map(int,local_passing_father),   # map to int because of new numpy warning when used as index
            map(int,local_passing_mother)
        ))
        # NOTE(review): the end of this method was cut off in the reviewed source;
        # this return follows the docstring and the 3-way unpacking in `process_batches`.
        return local_passing, local_sources, local_pairs
Source:School_Analysis.py  
1#!/usr/bin/env python2# coding: utf-83# # PyCity Schools Analysis4# 5# * As a whole, schools with higher budgets, did not yield better test results. By contrast, schools with higher spending per student actually (\$645-675) underperformed compared to schools with smaller budgets (<\$585 per student).6# 7# * As a whole, smaller and medium sized schools dramatically out-performed large sized schools on passing math performances (89-91% passing vs 67%).8# 9# * As a whole, charter schools out-performed the public district schools across all metrics. However, more analysis will be required to glean if the effect is due to school practices or the fact that charter schools tend to serve smaller student populations per school. 10# ---11# In[1]:12# Dependencies and Setup13import pandas as pd14# File to Load (Remember to Change These)15school_data_to_load = "Resources/schools_complete.csv"16student_data_to_load = "Resources/students_complete.csv"17# Read School and Student Data File and store into Pandas Data Frames18school_data = pd.read_csv(school_data_to_load)19student_data = pd.read_csv(student_data_to_load)20# Combine the data into a single dataset21school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])22# ## District Summary23# In[2]:24# Calculate the Totals (Schools and Students)25school_count = len(school_data_complete["school_name"].unique())26student_count = school_data_complete["Student ID"].count()27# Calculate the Total Budget28total_budget = school_data["budget"].sum()29# Calculate the Average Scores30average_math_score = school_data_complete["math_score"].mean()31average_reading_score = school_data_complete["reading_score"].mean()32overall_passing_rate = (average_math_score + average_reading_score) / 233# Calculate the Percentage Pass Rates34passing_math_count = school_data_complete[(school_data_complete["math_score"] >= 70)].count()["student_name"]35passing_math_percentage = passing_math_count / 
float(student_count) * 100  # NOTE(review): tail of a statement that begins before this chunk (presumably the % passing math)

# Percentage of students whose reading score is at least 70.
passing_reading_count = school_data_complete.loc[
    school_data_complete["reading_score"] >= 70, "student_name"
].count()
passing_reading_percentage = passing_reading_count / float(student_count) * 100

# Minor Data Cleanup
district_columns = [
    "Total Schools", "Total Students", "Total Budget",
    "Average Math Score", "Average Reading Score",
    "% Passing Math", "% Passing Reading", "% Overall Passing Rate",
]
district_summary = pd.DataFrame({
    "Total Schools": [school_count],
    "Total Students": [student_count],
    "Total Budget": [total_budget],
    "Average Math Score": [average_math_score],
    "Average Reading Score": [average_reading_score],
    "% Passing Math": [passing_math_percentage],
    "% Passing Reading": [passing_reading_percentage],
    "% Overall Passing Rate": [overall_passing_rate],
})
district_summary = district_summary[district_columns]
district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)

# Display the data frame
district_summary

# ## School Summary
# In[3]:

# Score/pass-rate columns shared by every summary table below.
score_columns = [
    "Average Math Score", "Average Reading Score",
    "% Passing Math", "% Passing Reading", "% Overall Passing Rate",
]

# Determine the School Type
school_types = school_data.set_index(["school_name"])["type"]

# Calculate the total student count
per_school_counts = school_data_complete["school_name"].value_counts()

# Select the column *before* aggregating: pandas >= 2.0 no longer drops
# non-numeric columns in GroupBy.mean(), so `.mean()["col"]` raises TypeError
# on the string columns of the merged frame (and averaged every column for
# nothing on older versions).
students_by_school = school_data_complete.groupby("school_name")

# Calculate the total school budget and per capita spending
per_school_budget = students_by_school["budget"].mean()
per_school_capita = per_school_budget / per_school_counts

# Calculate the average test scores
per_school_math = students_by_school["math_score"].mean()
per_school_reading = students_by_school["reading_score"].mean()

# Calculate the passing scores by creating a filtered data frame
school_passing_math = school_data_complete[(school_data_complete["math_score"] >= 70)]
school_passing_reading = school_data_complete[(school_data_complete["reading_score"] >= 70)]
per_school_passing_math = (
    school_passing_math.groupby("school_name")["student_name"].count() / per_school_counts * 100
)
per_school_passing_reading = (
    school_passing_reading.groupby("school_name")["student_name"].count() / per_school_counts * 100
)
# NOTE(review): this is the mean of the two pass rates, not the share of
# students who passed both subjects.
overall_passing_rate = (per_school_passing_math + per_school_passing_reading) / 2

# Convert to data frame
per_school_summary = pd.DataFrame({
    "School Type": school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing Rate": overall_passing_rate,
})

# Minor data munging
per_school_summary = per_school_summary[
    ["School Type", "Total Students", "Total School Budget", "Per Student Budget"] + score_columns
]
per_school_summary["Total School Budget"] = per_school_summary["Total School Budget"].map("${:,.2f}".format)
per_school_summary["Per Student Budget"] = per_school_summary["Per Student Budget"].map("${:,.2f}".format)

# Display the data frame
per_school_summary

# ## Top Performing Schools (By Passing Rate)
# In[4]:

# Sort and show top five schools
top_schools = per_school_summary.sort_values(["% Overall Passing Rate"], ascending=False)
top_schools.head(5)

# ## Bottom Performing Schools (By Passing Rate)
# In[5]:

# Sort and show bottom five schools
bottom_schools = per_school_summary.sort_values(["% Overall Passing Rate"], ascending=True)
bottom_schools.head(5)

# ## Math Scores by Grade
# In[6]:

# Create data series of scores by grade levels using conditionals
ninth_graders = school_data_complete[(school_data_complete["grade"] == "9th")]
tenth_graders = school_data_complete[(school_data_complete["grade"] == "10th")]
eleventh_graders = school_data_complete[(school_data_complete["grade"] == "11th")]
twelfth_graders = school_data_complete[(school_data_complete["grade"] == "12th")]

# Group each by school name
ninth_graders_scores = ninth_graders.groupby("school_name")["math_score"].mean()
tenth_graders_scores = tenth_graders.groupby("school_name")["math_score"].mean()
eleventh_graders_scores = eleventh_graders.groupby("school_name")["math_score"].mean()
twelfth_graders_scores = twelfth_graders.groupby("school_name")["math_score"].mean()

# Combine series into single data frame
scores_by_grade = pd.DataFrame({
    "9th": ninth_graders_scores,
    "10th": tenth_graders_scores,
    "11th": eleventh_graders_scores,
    "12th": twelfth_graders_scores,
})

# Minor data munging
scores_by_grade = scores_by_grade[["9th", "10th", "11th", "12th"]]
scores_by_grade.index.name = None

# Display the data frame
scores_by_grade

# ## Reading Score by Grade
# In[7]:

# Create data series of scores by grade levels using conditionals
ninth_graders = school_data_complete[(school_data_complete["grade"] == "9th")]
tenth_graders = school_data_complete[(school_data_complete["grade"] == "10th")]
eleventh_graders = school_data_complete[(school_data_complete["grade"] == "11th")]
twelfth_graders = school_data_complete[(school_data_complete["grade"] == "12th")]

# Group each by school name
ninth_graders_scores = ninth_graders.groupby("school_name")["reading_score"].mean()
tenth_graders_scores = tenth_graders.groupby("school_name")["reading_score"].mean()
eleventh_graders_scores = eleventh_graders.groupby("school_name")["reading_score"].mean()
twelfth_graders_scores = twelfth_graders.groupby("school_name")["reading_score"].mean()

# Combine series into single data frame
scores_by_grade = pd.DataFrame({
    "9th": ninth_graders_scores,
    "10th": tenth_graders_scores,
    "11th": eleventh_graders_scores,
    "12th": twelfth_graders_scores,
})

# Minor data munging
scores_by_grade = scores_by_grade[["9th", "10th", "11th", "12th"]]
scores_by_grade.index.name = None

# Display the data frame
scores_by_grade

# ## Scores by School Spending
# In[8]:

# Establish the bins
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]

# Categorize the spending based on the bins
per_school_summary["Spending Ranges (Per Student)"] = pd.cut(
    per_school_capita, spending_bins, labels=group_names
)

# observed=False keeps empty bins in the result (the long-standing default)
# and makes that choice explicit now that pandas is changing the default.
schools_by_spending = per_school_summary.groupby("Spending Ranges (Per Student)", observed=False)
spending_math_scores = schools_by_spending["Average Math Score"].mean()
spending_reading_scores = schools_by_spending["Average Reading Score"].mean()
spending_passing_math = schools_by_spending["% Passing Math"].mean()
spending_passing_reading = schools_by_spending["% Passing Reading"].mean()
overall_passing_rate = (spending_passing_math + spending_passing_reading) / 2

# Assemble into data frame
spending_summary = pd.DataFrame({
    "Average Math Score": spending_math_scores,
    "Average Reading Score": spending_reading_scores,
    "% Passing Math": spending_passing_math,
    "% Passing Reading": spending_passing_reading,
    "% Overall Passing Rate": overall_passing_rate,
})

# Minor data munging
spending_summary = spending_summary[score_columns]

# Display results
spending_summary

# ## Scores by School Size
# In[9]:

# Establish the bins
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# Categorize the school size based on the bins
per_school_summary["School Size"] = pd.cut(
    per_school_summary["Total Students"], size_bins, labels=group_names
)

# Calculate the scores based on bins
schools_by_size = per_school_summary.groupby("School Size", observed=False)
size_math_scores = schools_by_size["Average Math Score"].mean()
size_reading_scores = schools_by_size["Average Reading Score"].mean()
size_passing_math = schools_by_size["% Passing Math"].mean()
size_passing_reading = schools_by_size["% Passing Reading"].mean()
overall_passing_rate = (size_passing_math + size_passing_reading) / 2

# Assemble into data frame
size_summary = pd.DataFrame({
    "Average Math Score": size_math_scores,
    "Average Reading Score": size_reading_scores,
    "% Passing Math": size_passing_math,
    "% Passing Reading": size_passing_reading,
    "% Overall Passing Rate": overall_passing_rate,
})

# Minor data munging
size_summary = size_summary[score_columns]

# Display results
size_summary

# ## Scores by School Type
# In[10]:

# Type | Average Math Score | Average Reading Score | % Passing Math | % Passing Reading | % Overall Passing Rate
schools_by_type = per_school_summary.groupby("School Type")
type_math_scores = schools_by_type["Average Math Score"].mean()
type_reading_scores = schools_by_type["Average Reading Score"].mean()
type_passing_math = schools_by_type["% Passing Math"].mean()
type_passing_reading = schools_by_type["% Passing Reading"].mean()
overall_passing_rate = (type_passing_math + type_passing_reading) / 2

# Assemble into data frame
type_summary = pd.DataFrame({
    "Average Math Score": type_math_scores,
    "Average Reading Score": type_reading_scores,
    "% Passing Math": type_passing_math,
    "% Passing Reading": type_passing_reading,
    "% Overall Passing Rate": overall_passing_rate,
})

# Minor data munging
type_summary = type_summary[score_columns]

# Display results (the listing this script was recovered from is truncated here)
Source:test_pass_by_reference_or_value.py  
1from __future__ import print_function2from __future__ import division3from __future__ import absolute_import4import unittest5from jnius import autoclass6class PassByReferenceOrValueTest(unittest.TestCase):7    def _verify(self, numbers, changed):8        for i in range(len(numbers)):9            self.assertEqual(numbers[i], i * i if changed else i)10    def _verify_all(self, numbers, changed):11            for n, c in zip(numbers, changed):12                self._verify(n, c)13    def test_single_param_static(self):14        VariablePassing = autoclass('org.jnius.VariablePassing')15        # passed by reference (default), numbers should change16        numbers = list(range(10))17        VariablePassing.singleParamStatic(numbers)18        self._verify(numbers, True)19        # passed by reference, numbers should change20        numbers = list(range(10))21        VariablePassing.singleParamStatic(numbers, pass_by_reference=True)22        self._verify(numbers, True)23        # passed by value, numbers should not change24        numbers = list(range(10))25        VariablePassing.singleParamStatic(numbers, pass_by_reference=False)26        self._verify(numbers, False)27    def test_single_param(self):28        VariablePassing = autoclass('org.jnius.VariablePassing')29        variablePassing = VariablePassing()30        # passed by reference (default), numbers should change31        numbers = list(range(10))32        variablePassing.singleParam(numbers)33        self._verify(numbers, True)34        # passed by reference, numbers should change35        numbers = list(range(10))36        variablePassing.singleParam(numbers, pass_by_reference=True)37        self._verify(numbers, True)38        # passed by value, numbers should not change39        numbers = list(range(10))40        variablePassing.singleParam(numbers, pass_by_reference=False)41        self._verify(numbers, False)42    def test_multiple_params_static(self):43        VariablePassing = 
autoclass('org.jnius.VariablePassing')44        # passed by reference (default), all numbers should change45        numbers = [list(range(10)) for _ in range(4)]46        VariablePassing.multipleParamsStatic(*numbers)47        self._verify_all(numbers, [True] * 4)48        # passed by reference, all numbers should change49        numbers = [list(range(10)) for _ in range(4)]50        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=True)51        self._verify_all(numbers, [True] * 4)52        # passed by value, no numbers should change53        numbers = [list(range(10)) for _ in range(4)]54        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=False)55        self._verify_all(numbers, [False] * 4)56        # only the first set of numbers should change57        numbers = [list(range(10)) for _ in range(4)]58        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=[True, False])59        self._verify_all(numbers, [True, False, False, False])60        # only the first set of numbers should not change61        numbers = [list(range(10)) for _ in range(4)]62        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=[False, True])63        self._verify_all(numbers, [False, True, True, True])64        # only the odd sets of numbers should change65        numbers = [list(range(10)) for _ in range(4)]66        changed = (True, False, True, False)67        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=changed)68        self._verify_all(numbers, changed)69        # only the even sets of numbers should change70        numbers = [list(range(10)) for _ in range(4)]71        changed = (False, True, False, True)72        VariablePassing.multipleParamsStatic(*numbers, pass_by_reference=changed)73        self._verify_all(numbers, changed)74    def test_multiple_params(self):75        VariablePassing = autoclass('org.jnius.VariablePassing')76        variablePassing = VariablePassing()77        # 
passed by reference (default), all numbers should change78        numbers = [list(range(10)) for _ in range(4)]79        variablePassing.multipleParams(*numbers)80        self._verify_all(numbers, [True] * 4)81        # passed by reference, all numbers should change82        numbers = [list(range(10)) for _ in range(4)]83        variablePassing.multipleParams(*numbers, pass_by_reference=True)84        self._verify_all(numbers, [True] * 4)85        # passed by value, no numbers should change86        numbers = [list(range(10)) for _ in range(4)]87        variablePassing.multipleParams(*numbers, pass_by_reference=False)88        self._verify_all(numbers, [False] * 4)89        # only the first set of numbers should change90        numbers = [list(range(10)) for _ in range(4)]91        variablePassing.multipleParams(*numbers, pass_by_reference=[True, False])92        self._verify_all(numbers, [True, False, False, False])93        # only the first set of numbers should not change94        numbers = [list(range(10)) for _ in range(4)]95        variablePassing.multipleParams(*numbers, pass_by_reference=[False, True])96        self._verify_all(numbers, [False, True, True, True])97        # only the odd sets of numbers should change98        numbers = [list(range(10)) for _ in range(4)]99        changed = (True, False, True, False)100        variablePassing.multipleParams(*numbers, pass_by_reference=changed)101        self._verify_all(numbers, changed)102        # only the even sets of numbers should change103        numbers = [list(range(10)) for _ in range(4)]104        changed = (False, True, False, True)105        variablePassing.multipleParams(*numbers, pass_by_reference=changed)106        self._verify_all(numbers, changed)107    def test_contructor_single_param(self):108        VariablePassing = autoclass('org.jnius.VariablePassing')109        # passed by reference (default), numbers should change110        numbers = list(range(10))111        variablePassing = 
VariablePassing(numbers)112        self._verify(numbers, True)113        # passed by reference, numbers should change114        numbers = list(range(10))115        variablePassing = VariablePassing(numbers, pass_by_reference=True)116        self._verify(numbers, True)117        # passed by value, numbers should not change118        numbers = list(range(10))119        variablePassing = VariablePassing(numbers, pass_by_reference=False)120        self._verify(numbers, False)121    def test_contructor_multiple_params(self):122        VariablePassing = autoclass('org.jnius.VariablePassing')123        # passed by reference (default), all numbers should change124        numbers = [list(range(10)) for _ in range(4)]125        variablePassing = VariablePassing(*numbers)126        self._verify_all(numbers, [True] * 4)127        # passed by reference, all numbers should change128        numbers = [list(range(10)) for _ in range(4)]129        variablePassing = VariablePassing(*numbers, pass_by_reference=True)130        self._verify_all(numbers, [True] * 4)131        # passed by value, no numbers should change132        numbers = [list(range(10)) for _ in range(4)]133        variablePassing = VariablePassing(*numbers, pass_by_reference=False)134        self._verify_all(numbers, [False] * 4)135        # only the first set of numbers should change136        numbers = [list(range(10)) for _ in range(4)]137        variablePassing = VariablePassing(*numbers, pass_by_reference=[True, False])138        self._verify_all(numbers, [True, False, False, False])139        # only the first set of numbers should not change140        numbers = [list(range(10)) for _ in range(4)]141        variablePassing = VariablePassing(*numbers, pass_by_reference=[False, True])142        self._verify_all(numbers, [False, True, True, True])143        # only the odd sets of numbers should change144        numbers = [list(range(10)) for _ in range(4)]145        changed = (True, False, True, False)146        
variablePassing = VariablePassing(*numbers, pass_by_reference=changed)147        self._verify_all(numbers, changed)148        # only the even sets of numbers should change149        numbers = [list(range(10)) for _ in range(4)]150        changed = (False, True, False, True)151        variablePassing = VariablePassing(*numbers, pass_by_reference=changed)...database.py
Source:database.py  
import os
import sqlite3

import pandas as pd

# Share link that the original code passed straight to sqlite3.connect().
# sqlite3 only opens local files (or file: URIs), never HTTP(S) URLs, so that
# call always failed with "unable to open database file". The link is kept
# here as the place to download the database from.
DB_SOURCE_URL = "https://drive.google.com/file/d/1nJ-AvjMkAY0e8tV3iONIcE96psd1eiVi/view?usp=sharing"

# NOTE(review): the local file name is an assumption - confirm where the
# downloaded database actually lives, or set SOCCER_DB_PATH to point at it.
DB_PATH = os.environ.get("SOCCER_DB_PATH", "database.sqlite")

con = sqlite3.connect(DB_PATH)
cur = con.cursor()

# Team_Attributes columns that are averaged per team. The order here is the
# column order of the predictor frames returned below.
TEAM_ATTRIBUTES = (
    "buildUpPlaySpeed",
    "buildUpPlayDribbling",
    "buildUpPlayPassing",
    "chanceCreationPassing",
    "chanceCreationCrossing",
    "chanceCreationShooting",
    "defencePressure",
    "defenceAggression",
    "defenceTeamWidth",
)


def _avg_columns(prefix):
    """Return ``AVG(attr) AS <prefix>_attr, ...`` for every team attribute."""
    return ", ".join(f"AVG({attr}) AS {prefix}_{attr}" for attr in TEAM_ATTRIBUTES)


def _prefixed_columns(prefix):
    """Return ``<prefix>_attr, ...`` for every team attribute."""
    return ", ".join(f"{prefix}_{attr}" for attr in TEAM_ATTRIBUTES)


def get_all_countries():
    """Return every row of the ``Country`` table as a DataFrame."""
    return pd.read_sql('SELECT * FROM Country;', con)


def get_country_leagues(country_name):
    """Return the distinct league names of ``country_name``, sorted by name.

    The name is bound as a query parameter, so quotes in it can neither break
    nor alter the statement.
    """
    query = (
        'SELECT DISTINCT l.name FROM League l '
        'JOIN Country c ON l.country_id = c.id '
        'WHERE c.name = ? ORDER BY l.name ASC'
    )
    return pd.read_sql(query, con, params=(country_name,))


def get_league_teams(league_name):
    """Return (league name, team long name) rows for ``league_name``.

    Teams are found through the home side of ``Match`` rows, and the result
    is sorted by team name.
    """
    query = (
        'SELECT DISTINCT l.name, t.team_long_name FROM Match m '
        'JOIN League l ON m.league_id = l.id '
        'JOIN Team t ON m.home_team_api_id = t.team_api_id '
        'WHERE l.name = ? '
        'ORDER BY t.team_long_name ASC;'
    )
    return pd.read_sql(query, con, params=(league_name,))


def get_match_predictors():
    """Return one row per match: its outcome plus both teams' mean attributes.

    ``Match_Outcome`` is 1 when the home team scored more goals, 2 when the
    away team did, and 0 otherwise. The remaining columns are the per-team
    averages of ``TEAM_ATTRIBUTES``, prefixed ``ht_`` (home) and ``at_`` (away).
    """
    query = (
        'SELECT CASE WHEN home_team_goal > away_team_goal THEN 1 '
        'WHEN home_team_goal < away_team_goal THEN 2 '
        'ELSE 0 END AS Match_Outcome, '
        f'{_prefixed_columns("ht")}, {_prefixed_columns("at")} '
        'FROM Match m '
        f'JOIN (SELECT team_api_id, {_avg_columns("ht")} '
        'FROM Team_Attributes GROUP BY team_api_id) ht_attr '
        'ON ht_attr.team_api_id = home_team_api_id '
        f'JOIN (SELECT team_api_id, {_avg_columns("at")} '
        'FROM Team_Attributes GROUP BY team_api_id) at_attr '
        'ON at_attr.team_api_id = away_team_api_id;'
    )
    return pd.read_sql(query, con)


def get_team_predictors(home_team_name, away_team_name):
    """Return a single row with the mean attributes of the two named teams.

    Columns are the averages of ``TEAM_ATTRIBUTES`` for ``home_team_name``
    (prefixed ``ht_``) followed by those for ``away_team_name`` (``at_``).
    Both names are bound as query parameters.
    """
    query = (
        f'SELECT {_prefixed_columns("ht")}, {_prefixed_columns("at")} '
        f'FROM (SELECT {_avg_columns("ht")} '
        'FROM Team_Attributes home_attr JOIN Team home_team '
        'ON home_attr.team_api_id = home_team.team_api_id '
        'WHERE team_long_name = ?) ht_attr '
        f'JOIN (SELECT {_avg_columns("at")} '
        'FROM Team_Attributes away_attr JOIN Team away_team '
        'ON away_attr.team_api_id = away_team.team_api_id '
        'WHERE team_long_name = ?) at_attr ON 1=1;'
    )
    # NOTE(review): the listing this function was recovered from is truncated
    # after the query string; the return mirrors the sibling functions.
    return pd.read_sql(query, con, params=(home_team_name, away_team_name))


# ---------------------------------------------------------------------------
# Trailing page text from the scraped listing (not part of database.py):
# Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!
