>>> df
one two three
a -1.101558 1.124472 NaN
b -0.177289 2.487104 -0.634293
c 0.462215 -0.486066 1.931194
d NaN -0.456288 -1.222918
pandas包含丰富的函数来对数据进行统计分析。
mean函数
>>> df.mean(0) #对DataFrame的每列求平均数,axis=0
one -0.272211
two 0.667306
three 0.024661
dtype: float64
>>> df.mean(1) #对DataFrame的每行求平均数,axis=1
a 0.011457
b 0.558507
c 0.635781
d -0.839603
dtype: float64
pandas中的axis
我们来分析下pandas中的axis,它究竟是代表行还是列呢?以上mean函数的参数axis=1时,是对每一行求平均数,但在第一节中drop函数的参数axis=1时却删除了一列,那么到底axis=1代表的是行还是列呢?df.mean(axis=1)其实是在每一行上取所有列的均值,而不是保留每一列的均值。也许简单地来记就是axis=0代表跨行(down),而axis=1代表跨列(across),即当axis=0时表示沿着每一列或行索引值向下执行方法;当axis=1时表示沿着每一行或列索引值横向执行对应的方法。下图形象解释了axis的含义。
sum函数
>>> df.sum(0, skipna=False) # 按列求和,axis=0,skipna参数表示是否排除缺失值,默认为True
one NaN
two 2.669223
three NaN
dtype: float64
>>> df.sum(axis=1) # 按行求和,axis=1
a 0.022914
b 1.675522
c 1.907343
d -1.679206
dtype: float64
pandas常见的统计分析函数
3、函数应用
apply函数可以将一个函数应用在DataFrame的某个轴上。
DataFrame.apply(func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds)
>>> df.apply(np.mean, axis=1) # 等同于df.apply('mean', axis=1)
a 0.011457
b 0.558507
c 0.635781
d -0.839603
dtype: float64
>>> df.apply(lambda x: x.max() - x.min())
one 1.563773
two 2.973170
three 3.154112
dtype: float64
元素级函数应用可以使用applymap()。
在pandas V0.20.0版本中新出现一个函数agg()(DataFrame.aggregate()的简版),agg()与apply()不同的是,其在指定的轴上可以使用一个或多个函数对数据进行聚合操作。
DataFrame.agg(func, axis=0, *args, **kwargs)
>>> tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
.....: index=pd.date_range('1/1/2000', periods=10))
>>> tsdf.iloc[3:7] = np.nan # 将第4~7行设置为缺失值
>>> tsdf
A B C
2000-01-01 0.170247 -0.916844 0.835024
2000-01-02 1.259919 0.801111 0.445614
2000-01-03 1.453046 2.430373 0.653093
2000-01-04 NaN NaN NaN
2000-01-05 NaN NaN NaN
2000-01-06 NaN NaN NaN
2000-01-07 NaN NaN NaN
2000-01-08 -1.874526 0.569822 -0.609644
2000-01-09 0.812462 0.565894 -1.461363
2000-01-10 -0.985475 1.388154 -0.078747
>>> tsdf.agg(np.sum) # 一个函数的时候等同于tsdf.apply(np.sum)
A 0.835673
B 4.838510
C -0.216025
dtype: float64
>>> tsdf.agg(['sum', 'mean']) # 对多个函数进行聚合操作
A B C
sum 0.835673 4.838510 -0.216025
mean 0.139279 0.806418 -0.036004
还有一个函数应用transform(),与agg()函数很像。
DataFrame.transform(func, *args, **kwargs)
>>> tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
.....: index=pd.date_range('1/1/2000', periods=10))
>>> tsdf.iloc[3:7] = np.nan # 将第4~7行设置为缺失值
>>> tsdf
A B C
2000-01-01 -0.578465 -0.503335 -0.987140
2000-01-02 -0.767147 -0.266046 1.083797
2000-01-03 0.195348 0.722247 -0.894537
2000-01-04 NaN NaN NaN
2000-01-05 NaN NaN NaN
2000-01-06 NaN NaN NaN
2000-01-07 NaN NaN NaN
2000-01-08 -0.556397 0.542165 -0.308675
2000-01-09 -1.010924 -0.672504 -1.139222
2000-01-10 0.354653 0.563622 -0.365106
>>> tsdf.transform([np.abs, lambda x: x+1])
A B C
absolute <lambda> absolute <lambda> absolute <lambda>
2000-01-01 0.578465 0.421535 0.503335 0.496665 0.987140 0.012860
2000-01-02 0.767147 0.232853 0.266046 0.733954 1.083797 2.083797
2000-01-03 0.195348 1.195348 0.722247 1.722247 0.894537 0.105463
2000-01-04 NaN NaN NaN NaN NaN NaN
2000-01-05 NaN NaN NaN NaN NaN NaN
2000-01-06 NaN NaN NaN NaN NaN NaN
2000-01-07 NaN NaN NaN NaN NaN NaN
2000-01-08 0.556397 0.443603 0.542165 1.542165 0.308675 0.691325
2000-01-09 1.010924 -0.010924 0.672504 0.327496 1.139222 -0.139222
2000-01-10 0.354653 1.354653 0.563622 1.563622 0.365106 0.634894
4、排序
根据设置的条件对数据集进行排序(sorting),是pandas的一个重要的内置运算,sort_index可以对行或列索引进行排序,并返回一个已排序的新对象。
Series.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True)
DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None)
>>> df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
.....: 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
.....: 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
>>> unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
.....: columns=['three', 'two', 'one'])
.....:
>>> unsorted_df
three two one
a NaN 0.708543 0.036274
d -0.540166 0.586626 NaN
c 0.410238 1.121731 1.044630
b -0.282532 -2.038777 -0.490032
>>> unsorted_df.sort_index() #axis=0,对行轴的索引进行排序
three two one
a NaN 0.708543 0.036274
b -0.282532 -2.038777 -0.490032
c 0.410238 1.121731 1.044630
d -0.540166 0.586626 NaN
>>> unsorted_df.sort_index(axis=1)
one three two
a 0.036274 NaN 0.708543
d NaN -0.540166 0.586626
c 1.044630 0.410238 1.121731
b -0.490032 -0.282532 -2.038777
数据默认是按升序排序,可以通过设置参数ascending=False进行降序排序。
>>> unsorted_df.sort_index(ascending=False)
three two one
d -0.540166 0.586626 NaN
c 0.410238 1.121731 1.044630
b -0.282532 -2.038777 -0.490032
a NaN 0.708543 0.036274
按元素值进行排序
>>> df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]})
>>> df1.sort_values(by='two')
one two three
0 2 1 5
2 1 2 3
1 1 3 4
3 1 4 2
对多个列进行排序
>>> df1[['one', 'two', 'three']].sort_values(by=['one', 'two']) # 在one列有序的基础上再对two列进行排序
one two three
2 1 2 3
1 1 3 4
3 1 4 2
0 2 1 5
5、层次化索引(hierarchical indexing)
层次化索引是pandas的一个重要功能,层次化索引可以在一个轴上有多个索引级别,并以低维度形式处理高维度数据。
>>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
....: np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
>>> s = pd.Series(np.random.randn(8), index=arrays)
>>> s
bar one -0.861849
two -2.104569
baz one -0.494929
two 1.071804
foo one 0.721555
two -0.706771
qux one -1.039575
two 0.271860
dtype: float64
>>> index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
>>> df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
>>> df
first bar baz foo qux
second one two one two one two one two
A 0.895717 0.805244 -1.206412 2.565646 1.431256 1.340309 -1.170299 -0.226169
B 0.410835 0.813850 0.132003 -0.827317 -0.076467 -1.187678 1.130127 -1.436737
C -1.413681 1.607920 1.024180 0.569605 0.875906 -2.211372 0.974466 -2.006747
索引方式
>>> df['bar']
second one two
A 0.895717 0.805244
B 0.410835 0.813850
C -1.413681 1.607920
>>> df['bar', 'one']
A 0.895717
B 0.410835
C -1.413681
Name: (bar, one), dtype: float64
>>> df.xs('one', level='second', axis=1)
first bar baz foo qux
A 0.895717 -1.206412 1.431256 -1.170299
B 0.410835 0.132003 -0.076467 1.130127
C -1.413681 1.024180 0.875906 0.974466
>>> df.xs(('one', 'bar'), level=('second', 'first'), axis=1)
first bar
second one
A 0.895717
B 0.410835
C -1.413681
参考
pandas API文档:http://pandas.pydata.org/pandas-docs/stable/index.html
共同学习,写下你的评论
评论加载中...
作者其他优质文章