Best Python code snippet using pandera_python
test_shuffle.py
Source:test_shuffle.py  
...114    df = pd.DataFrame({'x': np.random.random(100),115                       'y': np.random.random(100) // 0.2},116                      index=np.random.random(100))117    ddf = dd.from_pandas(df, npartitions=npartitions)118    assert_eq(df.set_index('x'),119              ddf.set_index('x', shuffle='tasks'))120    assert_eq(df.set_index('y'),121              ddf.set_index('y', shuffle='tasks'))122    assert_eq(df.set_index(df.x),123              ddf.set_index(ddf.x, shuffle='tasks'))124    assert_eq(df.set_index(df.x + df.y),125              ddf.set_index(ddf.x + ddf.y, shuffle='tasks'))126    assert_eq(df.set_index(df.x + 1),127              ddf.set_index(ddf.x + 1, shuffle='tasks'))128    assert_eq(df.set_index(df.index),129              ddf.set_index(ddf.index, shuffle='tasks'))130@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])131def test_set_index_self_index(shuffle):132    df = pd.DataFrame({'x': np.random.random(100),133                       'y': np.random.random(100) // 0.2},134                      index=np.random.random(100))135    a = dd.from_pandas(df, npartitions=4)136    b = a.set_index(a.index, shuffle=shuffle)137    assert a is b138    assert_eq(b, df.set_index(df.index))139@pytest.mark.parametrize('shuffle', ['tasks'])140def test_set_index_names(shuffle):141    df = pd.DataFrame({'x': np.random.random(100),142                       'y': np.random.random(100) // 0.2},143                      index=np.random.random(100))144    ddf = dd.from_pandas(df, npartitions=4)145    assert (set(ddf.set_index('x', shuffle=shuffle).dask) ==146            set(ddf.set_index('x', shuffle=shuffle).dask))147    assert (set(ddf.set_index('x', shuffle=shuffle).dask) !=148            set(ddf.set_index('y', shuffle=shuffle).dask))149    assert (set(ddf.set_index('x', max_branch=4, shuffle=shuffle).dask) !=150            set(ddf.set_index('x', max_branch=3, shuffle=shuffle).dask))151    assert (set(ddf.set_index('x', drop=True, shuffle=shuffle).dask) !=152      
      set(ddf.set_index('x', drop=False, shuffle=shuffle).dask))153@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])154def test_set_index_tasks_2(shuffle):155    df = dd.demo.make_timeseries(156        '2000', '2004', {'value': float, 'name': str, 'id': int},157        freq='2H', partition_freq='1M', seed=1)158    df2 = df.set_index('name', shuffle=shuffle)159    df2.value.sum().compute(get=dask.get)160@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])161def test_set_index_tasks_3(shuffle):162    df = pd.DataFrame(np.random.random((10, 2)), columns=['x', 'y'])163    ddf = dd.from_pandas(df, npartitions=5)164    ddf2 = ddf.set_index('x', shuffle=shuffle, max_branch=2,165                         npartitions=ddf.npartitions)166    df2 = df.set_index('x')167    assert_eq(df2, ddf2)168    assert ddf2.npartitions == ddf.npartitions169@pytest.mark.parametrize('shuffle', ['tasks', 'disk'])170def test_shuffle_sort(shuffle):171    df = pd.DataFrame({'x': [1, 2, 3, 2, 1], 'y': [9, 8, 7, 1, 5]})172    ddf = dd.from_pandas(df, npartitions=3)173    df2 = df.set_index('x').sort_index()174    ddf2 = ddf.set_index('x', shuffle=shuffle)175    assert_eq(ddf2.loc[2:3], df2.loc[2:3])176@pytest.mark.parametrize('shuffle', ['tasks', 'disk'])177@pytest.mark.parametrize('get', [threaded_get, mp_get])178def test_rearrange(shuffle, get):179    df = pd.DataFrame({'x': np.random.random(10)})180    ddf = dd.from_pandas(df, npartitions=4)181    ddf2 = ddf.assign(y=ddf.x % 4)182    result = rearrange_by_column(ddf2, 'y', max_branch=32, shuffle=shuffle)183    assert result.npartitions == ddf.npartitions184    assert set(ddf.dask).issubset(result.dask)185    # Every value in exactly one partition186    a = result.compute(get=get)187    parts = get(result.dask, result._keys())188    for i in a.y.drop_duplicates():189        assert sum(i in part.y for part in parts) == 1190def test_rearrange_by_column_with_narrow_divisions():191    from dask.dataframe.tests.test_multi import list_eq192    A 
= pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})193    a = dd.repartition(A, [0, 4, 5])194    df = rearrange_by_divisions(a, 'x', (0, 2, 5))195    list_eq(df, a)196def test_maybe_buffered_partd():197    import partd198    f = maybe_buffered_partd()199    p1 = f()200    assert isinstance(p1.partd, partd.Buffer)201    f2 = pickle.loads(pickle.dumps(f))202    assert not f2.buffer203    p2 = f2()204    assert isinstance(p2.partd, partd.File)205def test_set_index_with_explicit_divisions():206    df = pd.DataFrame({'x': [4, 1, 2, 5]}, index=[10, 20, 30, 40])207    ddf = dd.from_pandas(df, npartitions=2)208    def throw(*args, **kwargs):209        raise Exception()210    with dask.set_options(get=throw):211        ddf2 = ddf.set_index('x', divisions=[1, 3, 5])212    assert ddf2.divisions == (1, 3, 5)213    df2 = df.set_index('x')214    assert_eq(ddf2, df2)215    # Divisions must be sorted216    with pytest.raises(ValueError):217        ddf.set_index('x', divisions=[3, 1, 5])218def test_set_index_divisions_2():219    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')})220    ddf = dd.from_pandas(df, 2)221    result = ddf.set_index('y', divisions=['a', 'c', 'd'])222    assert result.divisions == ('a', 'c', 'd')223    assert list(result.compute(get=dask.get).index[-2:]) == ['d', 'd']224def test_set_index_divisions_compute():225    d2 = d.set_index('b', divisions=[0, 2, 9], compute=False)226    d3 = d.set_index('b', divisions=[0, 2, 9], compute=True)227    assert_eq(d2, d3)228    assert_eq(d2, full.set_index('b'))229    assert_eq(d3, full.set_index('b'))230    assert len(d2.dask) > len(d3.dask)231    d4 = d.set_index(d.b, divisions=[0, 2, 9], compute=False)232    d5 = d.set_index(d.b, divisions=[0, 2, 9], compute=True)233    exp = full.copy()234    exp.index = exp.b235    assert_eq(d4, d5)236    assert_eq(d4, exp)237    assert_eq(d5, exp)238    assert len(d4.dask) > len(d5.dask)239def test_set_index_divisions_sorted():240    p1 = 
pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})241    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})242    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})243    ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},244                       'x', p1, [None, None, None, None])245    df = ddf.compute()246    def throw(*args, **kwargs):247        raise Exception("Shouldn't have computed")248    with dask.set_options(get=throw):249        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)250    assert_eq(res, df.set_index('x'))251    with dask.set_options(get=throw):252        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)253    assert_eq(res, df.set_index('y'))254    # with sorted=True, divisions must be same length as df.divisions255    with pytest.raises(ValueError):256        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)257    # Divisions must be sorted258    with pytest.raises(ValueError):259        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)260@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])261def test_set_index_reduces_partitions_small(shuffle):262    df = pd.DataFrame({'x': np.random.random(100)})263    ddf = dd.from_pandas(df, npartitions=50)264    ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')265    assert ddf2.npartitions < 10266@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])267def test_set_index_reduces_partitions_large(shuffle):268    n = 2**24269    df = pd.DataFrame({'x': np.random.random(n),270                       'y': np.random.random(n),271                       'z': np.random.random(n)})272    ddf = dd.from_pandas(df, npartitions=50, name='x', sort=False)273    ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')274    assert 1 < ddf2.npartitions < 20275@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])276def test_set_index_doesnt_increase_partitions(shuffle):277    n = 2**24278    df 
= pd.DataFrame({'x': np.random.random(n),279                       'y': np.random.random(n),280                       'z': np.random.random(n)})281    ddf = dd.from_pandas(df, npartitions=2, name='x', sort=False)282    ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')283    assert ddf2.npartitions <= ddf.npartitions284@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])285def test_set_index_detects_sorted_data(shuffle):286    df = pd.DataFrame({'x': range(100), 'y': range(100)})287    ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)288    ddf2 = ddf.set_index('x', shuffle=shuffle)289    assert len(ddf2.dask) < ddf.npartitions * 4290def test_set_index_sorts():291    # https://github.com/dask/dask/issues/2288292    vals = np.array([1348550149000000000, 1348550149000000000, 1348558142000000000,293                     1348558142000000000, 1348585928000000000, 1348585928000000000,294                     1348600739000000000, 1348601706000000000, 1348600739000000000,295                     1348601706000000000, 1348614789000000000, 1348614789000000000,296                     1348621037000000000, 1348621038000000000, 1348621040000000000,297                     1348621037000000000, 1348621038000000000, 1348621040000000000,298                     1348637628000000000, 1348638159000000000, 1348638160000000000,299                     1348638159000000000, 1348638160000000000, 1348637628000000000,300                     1348646354000000000, 1348646354000000000, 1348659107000000000,301                     1348657111000000000, 1348659107000000000, 1348657111000000000,302                     1348672876000000000, 1348672876000000000, 1348682787000000000,303                     1348681985000000000, 1348682787000000000, 1348681985000000000,304                     1348728167000000000, 1348728167000000000, 1348730745000000000,305                     1348730745000000000, 1348750198000000000, 1348750198000000000,306                     1348750198000000000, 
1348753539000000000, 1348753539000000000,307                     1348753539000000000, 1348754449000000000, 1348754449000000000,308                     1348761333000000000, 1348761554000000000, 1348761610000000000,309                     1348761333000000000, 1348761554000000000, 1348761610000000000,310                     1348782624000000000, 1348782624000000000, 1348782624000000000,311                     1348782624000000000])312    vals = pd.to_datetime(vals, unit='ns')313    breaks = [10, 36, 58]314    dfs = []315    for i in range(len(breaks)):316        lo = sum(breaks[:i])317        hi = sum(breaks[i:i + 1])318        dfs.append(pd.DataFrame({"timestamp": vals[lo:hi]}, index=range(lo, hi)))319    ddf = dd.concat(dfs).clear_divisions()320    assert ddf.set_index("timestamp").index.compute().is_monotonic is True321def test_set_index():322    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 2, 6]},323                                  index=[0, 1, 3]),324           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 5, 8]},325                                  index=[5, 6, 8]),326           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [9, 1, 8]},327                                  index=[9, 9, 9])}328    d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])329    full = d.compute()330    d2 = d.set_index('b', npartitions=3)331    assert d2.npartitions == 3332    assert d2.index.name == 'b'333    assert_eq(d2, full.set_index('b'))334    d3 = d.set_index(d.b, npartitions=3)335    assert d3.npartitions == 3336    assert d3.index.name == 'b'337    assert_eq(d3, full.set_index(full.b))338    d4 = d.set_index('b')339    assert d4.index.name == 'b'340    assert_eq(d4, full.set_index('b'))341def test_set_index_interpolate():342    df = pd.DataFrame({'x': [4, 1, 1, 3, 3], 'y': [1., 1, 1, 1, 2]})343    d = dd.from_pandas(df, 2)344    d1 = d.set_index('x', npartitions=3)345    assert d1.npartitions == 3346    assert set(d1.divisions) == set([1, 2, 3, 4])347    d2 = 
d.set_index('y', npartitions=3)348    assert d2.divisions[0] == 1.349    assert 1. < d2.divisions[1] < d2.divisions[2] < 2.350    assert d2.divisions[3] == 2.351def test_set_index_interpolate_int():352    L = sorted(list(range(0, 200, 10)) * 2)353    df = pd.DataFrame({'x': 2 * L})354    d = dd.from_pandas(df, 2)355    d1 = d.set_index('x', npartitions=10)356    assert all(np.issubdtype(type(x), np.integer) for x in d1.divisions)357def test_set_index_timezone():358    s_naive = pd.Series(pd.date_range('20130101', periods=3))359    s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))360    df = pd.DataFrame({'tz': s_aware, 'notz': s_naive})361    d = dd.from_pandas(df, 2)362    d1 = d.set_index('notz', npartitions=2)363    s1 = pd.DatetimeIndex(s_naive.values, dtype=s_naive.dtype)364    assert d1.divisions[0] == s_naive[0] == s1[0]365    assert d1.divisions[-1] == s_naive[2] == s1[2]366    # We currently lose "freq".  Converting data with pandas-defined dtypes367    # to numpy or pure Python can be lossy like this.368    d2 = d.set_index('tz', npartitions=2)369    s2 = pd.DatetimeIndex(s_aware, dtype=s_aware.dtype)370    assert d2.divisions[0] == s2[0]371    assert d2.divisions[-1] == s2[2]372    assert d2.divisions[0].tz == s2[0].tz373    assert d2.divisions[0].tz is not None374    s2badtype = pd.DatetimeIndex(s_aware.values, dtype=s_naive.dtype)375    with pytest.raises(TypeError):376        d2.divisions[0] == s2badtype[0]377@pytest.mark.parametrize('drop', [True, False])378def test_set_index_drop(drop):379    pdf = pd.DataFrame({'A': list('ABAABBABAA'),380                        'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],381                        'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})382    ddf = dd.from_pandas(pdf, 3)383    assert_eq(ddf.set_index('A', drop=drop),384              pdf.set_index('A', drop=drop))385    assert_eq(ddf.set_index('B', drop=drop),386              pdf.set_index('B', drop=drop))387    assert_eq(ddf.set_index('C', 
drop=drop),388              pdf.set_index('C', drop=drop))389    assert_eq(ddf.set_index(ddf.A, drop=drop),390              pdf.set_index(pdf.A, drop=drop))391    assert_eq(ddf.set_index(ddf.B, drop=drop),392              pdf.set_index(pdf.B, drop=drop))393    assert_eq(ddf.set_index(ddf.C, drop=drop),394              pdf.set_index(pdf.C, drop=drop))395    # numeric columns396    pdf = pd.DataFrame({0: list('ABAABBABAA'),397                        1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],398                        2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})399    ddf = dd.from_pandas(pdf, 3)400    assert_eq(ddf.set_index(0, drop=drop),401              pdf.set_index(0, drop=drop))402    assert_eq(ddf.set_index(2, drop=drop),403              pdf.set_index(2, drop=drop))404def test_set_index_raises_error_on_bad_input():405    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],406                       'b': [7, 6, 5, 4, 3, 2, 1]})407    ddf = dd.from_pandas(df, 2)408    msg = r"Dask dataframe does not yet support multi-indexes"409    with pytest.raises(NotImplementedError) as err:410        ddf.set_index(['a', 'b'])411    assert msg in str(err.value)412def test_set_index_sorted_true():413    df = pd.DataFrame({'x': [1, 2, 3, 4],414                       'y': [10, 20, 30, 40],415                       'z': [4, 3, 2, 1]})416    a = dd.from_pandas(df, 2, sort=False)417    assert not a.known_divisions418    b = a.set_index('x', sorted=True)419    assert b.known_divisions420    assert set(a.dask).issubset(set(b.dask))421    for drop in [True, False]:422        assert_eq(a.set_index('x', drop=drop),423                  df.set_index('x', drop=drop))424        assert_eq(a.set_index(a.x, sorted=True, drop=drop),425                  df.set_index(df.x, drop=drop))426        assert_eq(a.set_index(a.x + 1, sorted=True, drop=drop),427                  df.set_index(df.x + 1, drop=drop))428    with pytest.raises(ValueError):429        a.set_index(a.z, sorted=True)430def 
test_set_index_sorted_single_partition():431    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [1, 0, 1, 0]})432    ddf = dd.from_pandas(df, npartitions=1)433    assert_eq(ddf.set_index('x', sorted=True),434              df.set_index('x'))435def test_set_index_sorted_min_max_same():436    a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})437    b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})438    aa = delayed(a)439    bb = delayed(b)440    df = dd.from_delayed([aa, bb], meta=a)441    assert not df.known_divisions442    df2 = df.set_index('y', sorted=True)443    assert df2.divisions == (0, 1, 1)444def test_compute_divisions():445    from dask.dataframe.shuffle import compute_divisions446    df = pd.DataFrame({'x': [1, 2, 3, 4],447                       'y': [10, 20, 30, 40],448                       'z': [4, 3, 2, 1]},449                      index=[1, 3, 10, 20])450    a = dd.from_pandas(df, 2, sort=False)451    assert not a.known_divisions452    divisions = compute_divisions(a)453    b = copy(a)454    b.divisions = divisions455    assert_eq(a, b, check_divisions=False)456    assert b.known_divisions457def test_temporary_directory(tmpdir):458    df = pd.DataFrame({'x': np.random.random(100),459                       'y': np.random.random(100),460                       'z': np.random.random(100)})461    ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)462    with dask.set_options(temporary_directory=str(tmpdir),463                          get=dask.multiprocessing.get):464        ddf2 = ddf.set_index('x', shuffle='disk')465        ddf2.compute()466        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))467def test_empty_partitions():468    # See https://github.com/dask/dask/issues/2408469    df = pd.DataFrame({'a': list(range(10))})470    df['b'] = df['a'] % 3471    df['c'] = df['b'].astype(str)472    ddf = dd.from_pandas(df, npartitions=3)473    ddf = ddf.set_index('b')474    ddf = ddf.repartition(npartitions=3)475    
ddf.get_partition(0).compute()476    assert_eq(ddf, df.set_index('b'))477    ddf = ddf.set_index('c')...test_set_index.py
Source:test_set_index.py  
...14                {"a": 4, "m": 12, "p": 21},15            ],16            columns=["a", "m", "p", "x"],17        )18        result = df.set_index(["a", "x"])19        expected = df[["m", "p"]]20        expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])21        tm.assert_frame_equal(result, expected)22    def test_set_index_multiindexcolumns(self):23        columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])24        df = DataFrame(np.random.randn(3, 3), columns=columns)25        result = df.set_index(df.columns[0])26        expected = df.iloc[:, 1:]27        expected.index = df.iloc[:, 0].values28        expected.index.names = [df.columns[0]]29        tm.assert_frame_equal(result, expected)30    def test_set_index_timezone(self):31        # GH#1235832        # tz-aware Series should retain the tz33        idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")34        df = DataFrame({"A": idx})35        assert df.set_index(idx).index[0].hour == 1136        assert DatetimeIndex(Series(df.A))[0].hour == 1137        assert df.set_index(df.A).index[0].hour == 1138    def test_set_index_cast_datetimeindex(self):39        df = DataFrame(40            {41                "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],42                "B": np.random.randn(1000),43            }44        )45        idf = df.set_index("A")46        assert isinstance(idf.index, DatetimeIndex)47    def test_set_index_dst(self):48        di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific")49        df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()50        # single level51        res = df.set_index("index")52        exp = DataFrame(53            data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index")54        )55        tm.assert_frame_equal(res, exp)56        # GH#1292057        res = df.set_index(["index", "a"])58        exp_index = 
MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])59        exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)60        tm.assert_frame_equal(res, exp)61    def test_set_index(self, float_string_frame):62        df = float_string_frame63        idx = Index(np.arange(len(df))[::-1])64        df = df.set_index(idx)65        tm.assert_index_equal(df.index, idx)66        with pytest.raises(ValueError, match="Length mismatch"):67            df.set_index(idx[::2])68    def test_set_index_names(self):69        df = tm.makeDataFrame()70        df.index.name = "name"71        assert df.set_index(df.index).index.names == ["name"]72        mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])73        mi2 = MultiIndex.from_arrays(74            df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]75        )76        df = df.set_index(["A", "B"])77        assert df.set_index(df.index).index.names == ["A", "B"]78        # Check that set_index isn't converting a MultiIndex into an Index79        assert isinstance(df.set_index(df.index).index, MultiIndex)80        # Check actual equality81        tm.assert_index_equal(df.set_index(df.index).index, mi)82        idx2 = df.index.rename(["C", "D"])83        # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather84        # than a pair of tuples85        assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)86        # Check equality87        tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)88    def test_set_index_cast(self):89        # issue casting an index then set_index90        df = DataFrame(91            {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2]}, index=[2010, 2011, 2012]92        )93        df2 = df.set_index(df.index.astype(np.int32))94        tm.assert_frame_equal(df, df2)95    # A has duplicate values, C does not96    @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])97    @pytest.mark.parametrize("inplace", [True, 
False])98    @pytest.mark.parametrize("drop", [True, False])99    def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):100        df = frame_of_index_cols101        if isinstance(keys, list):102            idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)103        else:104            idx = Index(df[keys], name=keys)105        expected = df.drop(keys, axis=1) if drop else df106        expected.index = idx107        if inplace:108            result = df.copy()109            return_value = result.set_index(keys, drop=drop, inplace=True)110            assert return_value is None111        else:112            result = df.set_index(keys, drop=drop)113        tm.assert_frame_equal(result, expected)114    # A has duplicate values, C does not115    @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])116    @pytest.mark.parametrize("drop", [True, False])117    def test_set_index_append(self, frame_of_index_cols, drop, keys):118        df = frame_of_index_cols119        keys = keys if isinstance(keys, list) else [keys]120        idx = MultiIndex.from_arrays(121            [df.index] + [df[x] for x in keys], names=[None] + keys122        )123        expected = df.drop(keys, axis=1) if drop else df.copy()124        expected.index = idx125        result = df.set_index(keys, drop=drop, append=True)126        tm.assert_frame_equal(result, expected)127    # A has duplicate values, C does not128    @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])129    @pytest.mark.parametrize("drop", [True, False])130    def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):131        # append to existing multiindex132        df = frame_of_index_cols.set_index(["D"], drop=drop, append=True)133        keys = keys if isinstance(keys, list) else [keys]134        expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True)135        result = 
df.set_index(keys, drop=drop, append=True)136        tm.assert_frame_equal(result, expected)137    def test_set_index_after_mutation(self):138        # GH#1590139        df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})140        expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))141        df2 = df.loc[df.index.map(lambda indx: indx >= 1)]142        result = df2.set_index("key")143        tm.assert_frame_equal(result, expected)144    # MultiIndex constructor does not work directly on Series -> lambda145    # Add list-of-list constructor because list is ambiguous -> lambda146    # also test index name if append=True (name is duplicate here for B)147    @pytest.mark.parametrize(148        "box",149        [150            Series,151            Index,152            np.array,153            list,154            lambda x: [list(x)],155            lambda x: MultiIndex.from_arrays([x]),156        ],157    )158    @pytest.mark.parametrize(159        "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)]160    )161    @pytest.mark.parametrize("drop", [True, False])162    def test_set_index_pass_single_array(163        self, frame_of_index_cols, drop, append, index_name, box164    ):165        df = frame_of_index_cols166        df.index.name = index_name167        key = box(df["B"])168        if box == list:169            # list of strings gets interpreted as list of keys170            msg = "['one', 'two', 'three', 'one', 'two']"171            with pytest.raises(KeyError, match=msg):172                df.set_index(key, drop=drop, append=append)173        else:174            # np.array/list-of-list "forget" the name of B175            name_mi = getattr(key, "names", None)176            name = [getattr(key, "name", None)] if name_mi is None else name_mi177            result = df.set_index(key, drop=drop, append=append)178            # only valid column keys are dropped179            # since B is always passed as array above, 
nothing is dropped180            expected = df.set_index(["B"], drop=False, append=append)181            expected.index.names = [index_name] + name if append else name182            tm.assert_frame_equal(result, expected)183    # MultiIndex constructor does not work directly on Series -> lambda184    # also test index name if append=True (name is duplicate here for A & B)185    @pytest.mark.parametrize(186        "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]187    )188    @pytest.mark.parametrize(189        "append, index_name",190        [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],191    )192    @pytest.mark.parametrize("drop", [True, False])193    def test_set_index_pass_arrays(194        self, frame_of_index_cols, drop, append, index_name, box195    ):196        df = frame_of_index_cols197        df.index.name = index_name198        keys = ["A", box(df["B"])]199        # np.array/list "forget" the name of B200        names = ["A", None if box in [np.array, list, tuple, iter] else "B"]201        result = df.set_index(keys, drop=drop, append=append)202        # only valid column keys are dropped203        # since B is always passed as array above, only A is dropped, if at all204        expected = df.set_index(["A", "B"], drop=False, append=append)205        expected = expected.drop("A", axis=1) if drop else expected206        expected.index.names = [index_name] + names if append else names207        tm.assert_frame_equal(result, expected)208    # MultiIndex constructor does not work directly on Series -> lambda209    # We also emulate a "constructor" for the label -> lambda210    # also test index name if append=True (name is duplicate here for A)211    @pytest.mark.parametrize(212        "box2",213        [214            Series,215            Index,216            np.array,217            list,218            iter,219            lambda x: MultiIndex.from_arrays([x]),220            lambda x: x.name,221       
 ],222    )223    @pytest.mark.parametrize(224        "box1",225        [226            Series,227            Index,228            np.array,229            list,230            iter,231            lambda x: MultiIndex.from_arrays([x]),232            lambda x: x.name,233        ],234    )235    @pytest.mark.parametrize(236        "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]237    )238    @pytest.mark.parametrize("drop", [True, False])239    def test_set_index_pass_arrays_duplicate(240        self, frame_of_index_cols, drop, append, index_name, box1, box2241    ):242        df = frame_of_index_cols243        df.index.name = index_name244        keys = [box1(df["A"]), box2(df["A"])]245        result = df.set_index(keys, drop=drop, append=append)246        # if either box is iter, it has been consumed; re-read247        keys = [box1(df["A"]), box2(df["A"])]248        # need to adapt first drop for case that both keys are 'A' --249        # cannot drop the same column twice;250        # plain == would give ambiguous Boolean error for containers251        first_drop = (252            False253            if (254                isinstance(keys[0], str)255                and keys[0] == "A"256                and isinstance(keys[1], str)257                and keys[1] == "A"258            )259            else drop260        )261        # to test against already-tested behaviour, we add sequentially,262        # hence second append always True; must wrap keys in list, otherwise263        # box = list would be interpreted as keys264        expected = df.set_index([keys[0]], drop=first_drop, append=append)265        expected = expected.set_index([keys[1]], drop=drop, append=True)266        tm.assert_frame_equal(result, expected)267    @pytest.mark.parametrize("append", [True, False])268    @pytest.mark.parametrize("drop", [True, False])269    def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):270        df = 
frame_of_index_cols271        keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"])272        result = df.set_index(keys, drop=drop, append=append)273        # setting with a MultiIndex will never drop columns274        expected = df.set_index(["A", "B"], drop=False, append=append)275        tm.assert_frame_equal(result, expected)276    def test_construction_with_categorical_index(self):277        ci = tm.makeCategoricalIndex(10)278        ci.name = "B"279        # with Categorical280        df = DataFrame({"A": np.random.randn(10), "B": ci.values})281        idf = df.set_index("B")282        tm.assert_index_equal(idf.index, ci)283        # from a CategoricalIndex284        df = DataFrame({"A": np.random.randn(10), "B": ci})285        idf = df.set_index("B")286        tm.assert_index_equal(idf.index, ci)287        # round-trip288        idf = idf.reset_index().set_index("B")289        tm.assert_index_equal(idf.index, ci)290class TestSetIndexInvalid:291    def test_set_index_verify_integrity(self, frame_of_index_cols):292        df = frame_of_index_cols293        with pytest.raises(ValueError, match="Index has duplicate keys"):294            df.set_index("A", verify_integrity=True)295        # with MultiIndex296        with pytest.raises(ValueError, match="Index has duplicate keys"):297            df.set_index([df["A"], df["A"]], verify_integrity=True)298    @pytest.mark.parametrize("append", [True, False])299    @pytest.mark.parametrize("drop", [True, False])300    def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):301        df = frame_of_index_cols302        with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):303            # column names are A-E, as well as one tuple304            df.set_index(["foo", "bar", "baz"], drop=drop, append=append)305        # non-existent key in list with arrays306        with pytest.raises(KeyError, match="X"):307            df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)308   
     msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"309        # tuples always raise KeyError310        with pytest.raises(KeyError, match=msg):311            df.set_index(tuple(df["A"]), drop=drop, append=append)312        # also within a list313        with pytest.raises(KeyError, match=msg):314            df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)315    @pytest.mark.parametrize("append", [True, False])316    @pytest.mark.parametrize("drop", [True, False])317    @pytest.mark.parametrize("box", [set], ids=["set"])318    def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):319        df = frame_of_index_cols320        msg = 'The parameter "keys" may be a column key, .*'321        # forbidden type, e.g. set322        with pytest.raises(TypeError, match=msg):323            df.set_index(box(df["A"]), drop=drop, append=append)324        # forbidden type in list, e.g. set325        with pytest.raises(TypeError, match=msg):326            df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)327    # MultiIndex constructor does not work directly on Series -> lambda328    @pytest.mark.parametrize(329        "box",330        [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],331        ids=["Series", "Index", "np.array", "iter", "MultiIndex"],332    )333    @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])334    @pytest.mark.parametrize("append", [True, False])335    @pytest.mark.parametrize("drop", [True, False])336    def test_set_index_raise_on_len(337        self, frame_of_index_cols, box, length, drop, append338    ):339        # GH 24984340        df = frame_of_index_cols  # has length 5341        values = np.random.randint(0, 10, (length,))342        msg = "Length mismatch: Expected 5 rows, received array of length.*"343        # wrong length directly344        with pytest.raises(ValueError, match=msg):345            df.set_index(box(values), drop=drop, 
append=append)346        # wrong length in list347        with pytest.raises(ValueError, match=msg):348            df.set_index(["A", df.A, box(values)], drop=drop, append=append)349class TestSetIndexCustomLabelType:350    def test_set_index_custom_label_type(self):351        # GH#24969352        class Thing:353            def __init__(self, name, color):354                self.name = name355                self.color = color356            def __str__(self) -> str:357                return f"<Thing {repr(self.name)}>"358            # necessary for pretty KeyError359            __repr__ = __str__360        thing1 = Thing("One", "red")361        thing2 = Thing("Two", "blue")362        df = DataFrame({thing1: [0, 1], thing2: [2, 3]})363        expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))364        # use custom label directly365        result = df.set_index(thing2)366        tm.assert_frame_equal(result, expected)367        # custom label wrapped in list368        result = df.set_index([thing2])369        tm.assert_frame_equal(result, expected)370        # missing key371        thing3 = Thing("Three", "pink")372        msg = "<Thing 'Three'>"373        with pytest.raises(KeyError, match=msg):374            # missing label directly375            df.set_index(thing3)376        with pytest.raises(KeyError, match=msg):377            # missing label in list378            df.set_index([thing3])379    def test_set_index_custom_label_hashable_iterable(self):380        # GH#24969381        # actual example discussed in GH 24984 was e.g. for shapely.geometry382        # objects (e.g. 
a collection of Points) that can be both hashable and383        # iterable; using frozenset as a stand-in for testing here384        class Thing(frozenset):385            # need to stabilize repr for KeyError (due to random order in sets)386            def __repr__(self) -> str:387                tmp = sorted(self)388                joined_reprs = ", ".join(map(repr, tmp))389                # double curly brace prints one brace in format string390                return f"frozenset({{{joined_reprs}}})"391        thing1 = Thing(["One", "red"])392        thing2 = Thing(["Two", "blue"])393        df = DataFrame({thing1: [0, 1], thing2: [2, 3]})394        expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))395        # use custom label directly396        result = df.set_index(thing2)397        tm.assert_frame_equal(result, expected)398        # custom label wrapped in list399        result = df.set_index([thing2])400        tm.assert_frame_equal(result, expected)401        # missing key402        thing3 = Thing(["Three", "pink"])403        msg = r"frozenset\(\{'Three', 'pink'\}\)"404        with pytest.raises(KeyError, match=msg):405            # missing label directly406            df.set_index(thing3)407        with pytest.raises(KeyError, match=msg):408            # missing label in list409            df.set_index([thing3])410    def test_set_index_custom_label_type_raises(self):411        # GH#24969412        # purposefully inherit from something unhashable413        class Thing(set):414            def __init__(self, name, color):415                self.name = name416                self.color = color417            def __str__(self) -> str:418                return f"<Thing {repr(self.name)}>"419        thing1 = Thing("One", "red")420        thing2 = Thing("Two", "blue")421        df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])422        msg = 'The parameter "keys" may be a column key, .*'423        with pytest.raises(TypeError, 
match=msg):424            # use custom label directly425            df.set_index(thing2)426        with pytest.raises(TypeError, match=msg):427            # custom label wrapped in list...Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!
