分组级运算和转换

假设要添加一列的各索引分组平均值

第一种方法
import pandas as pd
from pandas import Series
import numpy as np

df = pd.DataFrame([[-2.04708,1.393406,'a','one'],
              [0.478943,0.092908,'a','two'],
              [-0.519439,0.281746,'b','one'],
              [-0.555730,0.769023,'b','two'],
              [1.965781,1.246435,'a','one'],
             ], columns=['data1','data2','key1','key2'])
df

        data1     data2    key1 key2
0   -2.047080   1.393406    a   one
1   0.478943    0.092908    a   two
2   -0.519439   0.281746    b   one
3   -0.555730   0.769023    b   two
4   1.965781    1.246435    a   one

# 先聚合求出平均值
key1_means = df.groupby('key1').mean().add_prefix('mean_')
key1_means

       mean_data1  mean_data2
key1
a      0.132548   0.910916
b     -0.537584   0.525385

# 在通过聚合函数加到DataFrame
pd.merge(df, key1_means,left_on='key1', right_index=True)

       data1      data2   key1  key2    mean_data1  mean_data2
0   -2.047080   1.393406    a   one      0.132548   0.910916
1   0.478943    0.092908    a   two      0.132548   0.910916
4   1.965781    1.246435    a   one      0.132548   0.910916
2   -0.519439   0.281746    b   one     -0.537584   0.525385
3   -0.555730   0.769023    b   two     -0.537584   0.525385
第二种方法 transform,会将一个函数应用到各个分组,有严格条件,要么传入可以广播的标量,要么产生一个相同大小的结果数组
df_mean = df.groupby('key2').transform(np.mean).add_prefix('mean_')
df_mean

    mean_data1  mean_data2
0   -0.200246   0.973862
1   -0.038393   0.430966
2   -0.200246   0.973862
3   -0.038393   0.430966
4   -0.200246   0.973862

pd.concat([df,df_mean],axis=1)

      data1       data2   key1  key2  data1      data2
0   -2.047080   1.393406    a   one -0.200246   0.973862
1   0.478943    0.092908    a   two -0.038393   0.430966
2   -0.519439   0.281746    b   one -0.200246   0.973862
3   -0.555730   0.769023    b   two -0.038393   0.430966
4   1.965781    1.246435    a   one -0.200246   0.973862

apply一般性的'拆分-应用-合并'

apply会将待处理的对象拆分成多个片段,然后对各片段调用传入的函数,最后尝试将各片段组合到一起

# 选取指定列具有最大值的行的函数
def top(df, n=3, column='tip_pct'):
    return df.sort_index(by=column)[-n:]

tips = pd.read_csv('C:/Users/1/Desktop/tips.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips.head()

 total_bill tip       sex   smoker  day  time   size    tip_pct
0   16.99    1.01   Female   No     Sun Dinner  2      0.059447
1   10.34    1.66   Male     No     Sun Dinner  3      0.160542
2   21.01    3.50   Male     No     Sun Dinner  3      0.166587
3   23.68    3.31   Male     No     Sun Dinner  2      0.139780
4   24.59    3.61   Female   No     Sun Dinner  4      0.146808

# 选取前三个最大值
top(tips,n=3)

total_bill  tip      sex    smoker  day time    size    tip_pct
67  3.07    1.00    Female  Yes     Sat Dinner   1     0.325733
178 9.60    4.00    Female  Yes     Sun Dinner   2     0.416667
172 7.25    5.15    Male    Yes     Sun Dinner   2     0.710345

# 按是否吸烟分组,选前三个最大的值
# 过程是top函数在各个片段上调用后,结果由pandas.concat组装到一起
tips.groupby('smoker').apply(top)

             total_bill tip      sex  smoker  day   time    size     tip_pct
smoker
No     51       10.29   2.60    Female  No    Sun   Dinner   2       0.252672
       149      7.51    2.00    Male    No    Thur  Lunch    2       0.266312
       232      11.61   3.39    Male    No    Sat   Dinner   2       0.291990
Yes    67       3.07    1.00    Female  Yes   Sat   Dinner   1       0.325733
       178      9.60    4.00    Female  Yes   Sun   Dinner   2       0.416667
       172      7.25    5.15    Male    Yes   Sun   Dinner   2       0.710345

# 如果传给apply的函数能够接受其他参数或关键字,则可以将这些一并传入
# 总花费的钱,按是否吸烟和每周的天数来找出每天其中价格最高的,n代表返回的数据前几个
tips.groupby(['smoker','day']).apply(top, n=1, column='total_bill')

              total_bill    tip      sex    smoker     day  time    size    tip_pct
smoker day
No     Fri  94  22.75       3.25    Female  No         Fri  Dinner   2      0.142857
       Sat  212 48.33       9.00    Male    No         Sat  Dinner   4      0.186220
       Sun  156 48.17       5.00    Male    No         Sun  Dinner   6      0.103799
       Thur 142 41.19       5.00    Male    No         Thur Lunch    5      0.121389
Yes    Fri  95  40.17       4.73    Male    Yes        Fri  Dinner   4      0.117750
       Sat  170 50.81      10.00    Male    Yes        Sat  Dinner   3      0.196812
       Sun  182 45.35       3.50    Male    Yes        Sun  Dinner   3      0.077178
       Thur 197 43.11       5.00    Female  Yes        Thur Lunch    4      0.115982

# 分组调用describe的方法
tips.groupby('smoker')['tip_pct'].describe().T

smoker     No          Yes
count   151.000000  93.000000
mean    0.159328    0.163196
std     0.039910    0.085119
min     0.056797    0.035638
25%     0.136906    0.106771
50%     0.155625    0.153846
75%     0.185014    0.195059
max     0.291990    0.710345

# 本质是,下面两行代码的快捷键而已
f = lambda x:x.describe()
tips.groupby('smoker')['tip_pct'].apply(f).unstack('smoker')

smoker      No        Yes
count   151.000000  93.000000
mean    0.159328    0.163196
std     0.039910    0.085119
min     0.056797    0.035638
25%     0.136906    0.106771
50%     0.155625    0.153846
75%     0.185014    0.195059
max     0.291990    0.710345

# 禁用层次化索引
tips.groupby('smoker',group_keys=False).apply(top)

    total_bill  tip      sex    smoker  day     time      size   tip_pct
51     10.29    2.60    Female    No    Sun    Dinner      2    0.252672
149     7.51    2.00    Male      No    Thur   Lunch       2    0.266312
232    11.61    3.39    Male      No    Sat    Dinner      2    0.291990
67     3.07     1.00    Female    Yes   Sat    Dinner      1    0.325733
178    9.60     4.00    Female    Yes   Sun    Dinner      2    0.416667
172    7.25     5.15    Male      Yes   Sun    Dinner      2    0.710345

分位数和桶分析


frame = pd.DataFrame({'data1':np.random.randn(1000),
                      'data2':np.random.randn(1000)})
factor = pd.cut(frame['data1'],4)
factor[:5]

0    (-1.573, 0.112]
1    (-1.573, 0.112]
2    (-1.573, 0.112]
3    (-1.573, 0.112]
4    (-1.573, 0.112]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.264, -1.573] < (-1.573, 0.112] < (0.112, 1.797] < (1.797, 3.482]]

def get_stats(group):
    return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}

# 长度即每个区间相等的桶(区间大小相等)
frame.data2.groupby(factor).apply(get_stats).unstack()

                 count    max         mean          min
data1
(-3.264, -1.573]  57.0  3.236024    0.100749    -2.149984
(-1.573, 0.112]   484.0 2.843239    -0.058549   -3.606913
(0.112, 1.797]    425.0 2.614935    0.065693    -3.463799
(1.797, 3.482]    34.0  1.791511    -0.049641   -1.756306

# 大小相等的桶,labels关闭区间名称(数据点数量相等)
ppp = pd.qcut(frame['data1'],4,labels=False)
frame.data2.groupby(ppp).apply(get_stats).unstack()

    count      max        mean         min
data1
0   250.0   3.236024    -0.032592   -2.750112
1   250.0   2.843239    -0.068005   -3.606913
2   250.0   2.614935    0.103220    -2.380858
3   250.0   2.612170    0.011922    -3.463799