Python practice: pandas
'''
http://pandas.pydata.org/pandas-docs/stable/10min.html
numpy's main data structure is the ndarray;
pandas' main data structures are Series and DataFrame.
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df1 = pd.DataFrame(np.array(range(101,125)).reshape(6,4), index=range(6), columns=list('ABCD'))
print(df1)
#      A    B    C    D
# 0  101  102  103  104
# 1  105  106  107  108
# 2  109  110  111  112
# 3  113  114  115  116
# 4  117  118  119  120
# 5  121  122  123  124

df2 = pd.DataFrame({'custID':['C0001','C0002','C0004','C0004','C0004','C0003'],
                    'accountID':pd.Series(['6214C000101', '6214C000201', '6214C000401', '6214C000403', '6214C000402', '6214C000301'], index=range(6), dtype='str'),
                    'tradeDate':pd.Series(['2018-01-18 14:00:00', '2018-01-18 14:00:00', '2018-01-18 14:00:01', '2018-01-18 14:00:03', '2018-01-18 14:00:02', '2018-01-18 14:00:00'], index=range(6), dtype='str'),
                    'tradeAmt':pd.Series([100.0, 100.0, 101.0, 103.0, 102.0, 100.0], index=range(6), dtype='float'),
                    'tradeDesc':'xxxxxx',
                    'mark':pd.Categorical(["row1","row2","row3","row4","row5","row6"])},
                   index=range(6))
# Note: the DataFrame's index and each column Series' index must line up.
# A DataFrame's real index starts at 0 by default; what you set here are index labels.
# If you give the DataFrame custom index labels and a column is a Series, that Series
# must carry the same index, otherwise the values end up misaligned.
# (A small sketch of this caveat follows this overview section.)
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.dtypes)
# accountID      object
# custID         object
# mark         category
# tradeAmt      float64
# tradeDate      object
# tradeDesc      object
# dtype: object

print(df2.index)
# RangeIndex(start=0, stop=6, step=1)

print(df2.columns)
# Index(['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'], dtype='object')

print(df2.values)
# [['6214C000101' 'C0001' 'row1' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000201' 'C0002' 'row2' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000401' 'C0004' 'row3' 101.0 '2018-01-18 14:00:01' 'xxxxxx']
#  ['6214C000403' 'C0004' 'row4' 103.0 '2018-01-18 14:00:03' 'xxxxxx']
#  ['6214C000402' 'C0004' 'row5' 102.0 '2018-01-18 14:00:02' 'xxxxxx']
#  ['6214C000301' 'C0003' 'row6' 100.0 '2018-01-18 14:00:00' 'xxxxxx']]

print(df2.head(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.tail(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.describe())  # summary statistics; only numeric columns are included, non-numeric columns are skipped
#          tradeAmt
# count    6.000000
# mean   101.000000
# std      1.264911
# min    100.000000
# 25%    100.000000
# 50%    100.500000
# 75%    101.750000
# max    103.000000

print(df2.T)  # transpose
#                              0                    1                    2  \
# accountID          6214C000101          6214C000201          6214C000401
# custID                   C0001                C0002                C0004
# mark                      row1                 row2                 row3
# tradeAmt                   100                  100                  101
# tradeDate  2018-01-18 14:00:00  2018-01-18 14:00:00  2018-01-18 14:00:01
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
#
#                              3                    4                    5
# accountID          6214C000403          6214C000402          6214C000301
# custID                   C0004                C0004                C0003
# mark                      row4                 row5                 row6
# tradeAmt                   103                  102                  100
# tradeDate  2018-01-18 14:00:03  2018-01-18 14:00:02  2018-01-18 14:00:00
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
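# --- Added sketch: index alignment between DataFrame and column Series ---
# A minimal illustration of the alignment caveat noted at df2's construction
# (not part of the original script; it relies only on standard pandas
# alignment): if a column Series carries labels that don't match the
# DataFrame's index, values are placed by label and unmatched rows become NaN.
df_align = pd.DataFrame({'a': pd.Series([10, 20], index=[0, 1]),
                         'b': pd.Series([30, 40], index=[5, 6])},  # labels 5/6 don't exist in the frame
                        index=[0, 1])
print(df_align)
#     a   b
# 0  10 NaN    <- 'b' has no labels 0/1, so its values are lost here
# 1  20 NaN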
print('------------------------------------------------------------------------------------')
print(df2.sort_values(by='tradeDate',ascending=False))  # sort by a column's values, descending
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.sort_values(by=['custID','tradeDate'],ascending=[True,False]))  # sort by several columns together
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx

print(df2.sort_index(axis=0,ascending=False))  # sort by the row index
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.sort_index(axis=1,ascending=True))  # sort by the column labels (already alphabetical here, so the output is unchanged)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print('------------------------------------------------------------------------------------')
'''
iloc looks rows/columns up by integer position, loc by label;
likewise, for single scalars, iat is by position and at is by label.
'''
print(df2['custID'])
# 0    C0001
# 1    C0002
# 2    C0004
# 3    C0004
# 4    C0004
# 5    C0003
# Name: custID, dtype: object

print(df2[0:4])  # slice by row position
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2[1:4])  # slice by row position
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2.loc[1,'accountID'])  # look up by row/column label, not by integer position
# 6214C000201

print(df2.iloc[3])  # 4th row
# accountID            6214C000403
# custID                     C0004
# mark                        row4
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# Name: 3, dtype: object

print(df2.iloc[3,4])  # 4th row, 5th column
# 2018-01-18 14:00:03

print(df2.iloc[3:4])  # rows 4 up to (not including) 5
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2.iloc[3:5,1:3])  # rows 4-5, columns 2-3 (without a custom order, column positions follow the alphabetically sorted column names)
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5

print(df2.iloc[[3,4],[1,2]])  # rows 4-5, columns 2-3, via explicit position lists
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5

print(df2.iloc[3:5,:])  # rows 4-5, all columns
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx

print(df2.iloc[:,1:3])  # all rows, columns 2-3
#   custID  mark
# 0  C0001  row1
# 1  C0002  row2
# 2  C0004  row3
# 3  C0004  row4
# 4  C0004  row5
# 5  C0003  row6

print(df2[df2.tradeAmt > 101.0])  # boolean filtering
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
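# --- Added sketch: combining boolean filters ---
# Not in the original script: conditions combine with & (and) / | (or), and
# each condition needs its own parentheses because of operator precedence.
print(df2[(df2.tradeAmt > 100.0) & (df2.custID == 'C0004')])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx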
print('------------------------------------------------------------------------------------')
df3 = df2.copy()
df3["custID"] = ["NEW","NEW","NEW","NEW","NEW","NEW"]  # update an entire column
df3.loc[:,'tradeAmt'] = range(len(df3))                # update by row/column labels
df3.at[range(7)[1],'accountID'] = '==========='        # update a single cell by labels (range(7)[1] is just 1)
df3.iat[0,0] = '+++++++++++'                           # update a single cell by positions
# df3[df3.tradeDate == '2018-01-18 14:00:03'] = -df3   # find matching rows, then negate them; only works if every column is numeric
print(df3)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  +++++++++++    NEW  row1         0  2018-01-18 14:00:00    xxxxxx
# 1  ===========    NEW  row2         1  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401    NEW  row3         2  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403    NEW  row4         3  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402    NEW  row5         4  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301    NEW  row6         5  2018-01-18 14:00:00    xxxxxx

print('------------------------------------------------------------------------------------')
df4 = df2.reindex(index=range(4), columns=['custID','accountID','tradeAmt'])  # recombine: extract a subset of rows and columns
df4.loc[0:1,'tradeAmt'] = 200   # if the column exists, this updates it
df4.loc[0:1,'newColumn'] = 1    # if the column does not exist, this adds it
print(df4)
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0        NaN
# 3  C0004  6214C000403     103.0        NaN

print(df4.dropna(how='any'))  # drop every row that contains a missing value
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0

print(df4.fillna(value=999))  # fill missing values
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0      999.0
# 3  C0004  6214C000403     103.0      999.0

print(pd.isnull(df4))  # test for missing values
#   custID  accountID  tradeAmt  newColumn
# 0  False      False     False      False
# 1  False      False     False      False
# 2  False      False     False       True
# 3  False      False     False       True

print('------------------------------------------------------------------------------------')
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.mean())  # mean of the numeric columns (pandas >= 2.0 needs df2.mean(numeric_only=True) on mixed dtypes)
# tradeAmt    101.0
# dtype: float64

s = pd.Series([1,3,5,np.nan,6,8], index=range(6)).shift(2)  # shift values down 2 rows; the leading rows become NaN
print(s)
# 0    NaN
# 1    NaN
# 2    1.0
# 3    3.0
# 4    5.0
# 5    NaN
# dtype: float64

print(df2.shift(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0          NaN    NaN   NaN       NaN                  NaN       NaN
# 1          NaN    NaN   NaN       NaN                  NaN       NaN
# 2  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 5  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print('------------------------------------------------------------------------------------')
print(df2.apply(lambda x: max(x)))  # apply a function column-wise; a lambda or a named function both work
# accountID            6214C000403
# custID                     C0004
# mark                        row6
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# dtype: object

print('------------------------------------------------------------------------------------')
print(df2["custID"].value_counts())  # similar to GROUP BY ... COUNT
# C0004    3
# C0001    1
# C0002    1
# C0003    1
# Name: custID, dtype: int64
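# --- Added sketch: the groupby equivalent ---
# Not in the original script: the same "group by count" can be spelled with
# groupby; note value_counts sorts by count, groupby sorts by the group key.
print(df2.groupby('custID').size())
# custID
# C0001    1
# C0002    1
# C0003    1
# C0004    3
# dtype: int64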
print('------------------------------------------------------------------------------------')
print(df2["mark"].str.upper())  # case conversion
# 0    ROW1
# 1    ROW2
# 2    ROW3
# 3    ROW4
# 4    ROW5
# 5    ROW6
# Name: mark, dtype: object

print('------------------------------------------------------------------------------------')
df5 = pd.DataFrame(np.random.randn(9,3))
print(df5)
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 2  2.063722  0.229955  0.020368
# 3 -2.024974  0.307957 -0.579090
# 4 -1.571883  0.260561 -0.884209
# 5  2.465572 -1.001873  1.243028
# 6  0.025388 -0.372608  1.431214
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827

pieces = [df5[:2], df5[5:6], df5[7:]]  # cut the head, a middle piece, and the tail, then stitch them back together
print(pieces)  # a list of three DataFrames (the column header prints once per piece)
# [          0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800,           0         1         2
# 5  2.465572 -1.001873  1.243028,           0         1         2
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827]

print(pd.concat(pieces))  # concatenate
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 5  2.465572 -1.001873  1.243028
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827

print('------------------------------------------------------------------------------------')
df_left  = pd.DataFrame({'key':['001','002','007'],'val':['999','1','2']})
df_right = pd.DataFrame({'key':['001','002','009'],'key2':['001','002','009'],'val':['999','3','4']})
print(df_left)
#    key  val
# 0  001  999
# 1  002    1
# 2  007    2
print(df_right)
#    key key2  val
# 0  001  001  999
# 1  002  002    3
# 2  009  009    4

print( pd.merge(df_left, df_right, how='inner', on='key') )  # inner join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3

print( pd.merge(df_left, df_right, how='inner', left_on='key', right_on='key2') )  # inner join on differently named columns
#   key_x val_x key_y key2 val_y
# 0   001   999   001  001   999
# 1   002     1   002  002     3

print( pd.merge(df_left, df_right, how='inner', on=['key','val']) )  # inner join on several columns
#    key  val key2
# 0  001  999  001

print( pd.merge(df_left, df_right, how='left', on='key') )  # left outer join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  007     2  NaN   NaN

print( pd.merge(df_left, df_right, how='right', on='key') )  # right outer join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  009   NaN  009     4

print('------------------------------------------------------------------------------------')
print(df2.append(df2[:3], ignore_index=True))  # slice rows off the original and append them back; ignore_index drops the slice's labels and renumbers the index
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 6  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 7  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 8  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx   (appended)

print(df2.append(df2[:3], ignore_index=False))  # keep the slice's index labels; notice that index labels are allowed to repeat
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx   (appended)
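# --- Added note: DataFrame.append in newer pandas ---
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# the equivalent today is pd.concat. A minimal sketch of the first call above:
print(pd.concat([df2, df2[:3]], ignore_index=True))  # same result as df2.append(df2[:3], ignore_index=True)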
print('------------------------------------------------------------------------------------')
# zip() takes iterables as arguments, packs their corresponding elements into
# tuples, and returns the sequence of those tuples
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])  # hierarchical index labels (MultiIndex)
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

stacked = df6.stack()  # turn the rectangular layout into a "stacked" one: the column labels are appended after the row labels
print(stacked)
# first  second
# bar    one     A   -0.101234
#                B   -0.956210
#        two     A   -0.480354
#                B    1.308950
# baz    one     A    0.943706
#                B    0.976480
#        two     A   -0.788852
#                B   -1.556547
# foo    one     A    0.997527
#                B   -0.337391
#        two     A   -0.191448
#                B   -0.083129
# qux    one     A   -0.919527
#                B   -0.414051
#        two     A   -0.579727
#                B    1.595290
# dtype: float64

print(stacked["bar"]["one"]["A"])  # the stacked form can be indexed level by level, like a nested array
# -0.101233870095

unstacked = stacked.unstack()  # invert it: turn the stacked form back into rows and columns (the innermost row labels become columns)
print(unstacked)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

unstacked_unstacked_0 = unstacked.unstack(0)  # keep turning row labels into column labels: here the 1st index level
print(unstacked_unstacked_0)
#                A                                       B
# first        bar       baz       foo       qux       bar       baz       foo       qux
# second
# one    -0.101234  0.943706  0.997527 -0.919527 -0.956210  0.976480 -0.337391 -0.414051
# two    -0.480354 -0.788852 -0.191448 -0.579727  1.308950 -1.556547 -0.083129  1.595290

unstacked_unstacked_1 = unstacked.unstack(1)  # or the 2nd index level
print(unstacked_unstacked_1)
#                A                   B
# second       one       two       one       two
# first
# bar    -0.101234 -0.480354 -0.956210  1.308950
# baz     0.943706 -0.788852  0.976480 -1.556547
# foo     0.997527 -0.191448 -0.337391 -0.083129
# qux    -0.919527 -0.579727 -0.414051  1.595290

print('------------------------------------------------------------------------------------')
df7 = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                    'B' : ['A', 'B', 'C'] * 4,
                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D' : np.random.randn(12),
                    'E' : np.random.randn(12)})
print(df7)
#         A  B    C         D         E
# 0     one  A  foo -0.516297 -0.860641
# 1     one  B  foo -1.560483 -1.647366
# 2     two  C  foo  1.124756  0.329971
# 3   three  A  bar -0.312954  0.040263
# 4     one  B  bar -1.355079  0.358829
# 5     one  C  bar  0.749617  0.978513
# 6     two  A  foo -2.173830  0.434789
# 7   three  B  foo -1.070213  0.641253
# 8     one  C  foo -0.515032  0.127273
# 9     one  A  bar -1.408970  0.025128
# 10    two  B  bar -0.390044  0.060392
# 11  three  C  bar  0.067667  0.676595

print( pd.pivot_table(df7, values='D', index=['A', 'B'], columns=['C']) )  # pivot table
# C             bar       foo
# A     B
# one   A -1.408970 -0.516297
#       B -1.355079 -1.560483
#       C  0.749617 -0.515032
# three A -0.312954       NaN
#       B       NaN -1.070213
#       C  0.067667       NaN
# two   A       NaN -2.173830
#       B -0.390044       NaN
#       C       NaN  1.124756
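# --- Added sketch: pivot_table with aggfunc and fill_value ---
# Not in the original script: the aggregation defaults to the mean, but it can
# be swapped out via aggfunc, and missing combinations filled via fill_value
# (here: a count per (A, B, C) combination, with 0 instead of NaN).
print( pd.pivot_table(df7, values='D', index=['A', 'B'], columns=['C'],
                      aggfunc='count', fill_value=0) )
# each (A, B, C) combination occurs exactly once in df7, so every cell is 1 or 0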
print('------------------------------------------------------------------------------------')
rng = pd.date_range('1/1/2012', periods=10, freq='min')  # as the output shows, the result is a DatetimeIndex
print(rng)
# DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:01:00',
#                '2012-01-01 00:02:00', '2012-01-01 00:03:00',
#                '2012-01-01 00:04:00', '2012-01-01 00:05:00',
#                '2012-01-01 00:06:00', '2012-01-01 00:07:00',
#                '2012-01-01 00:08:00', '2012-01-01 00:09:00'],
#               dtype='datetime64[ns]', freq='T')

ts = pd.Series(range(10), index=rng)  # a time series
print(ts)
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32

print( ts.resample('5Min').sum() )  # resample() is the convenient way to re-bucket time-series data
# 2012-01-01 00:00:00    10
# 2012-01-01 00:05:00    35
# Freq: 5T, dtype: int32

ts_utc = ts.tz_localize('UTC')  # attach a timezone (UTC: Coordinated Universal Time; GMT: Greenwich Mean Time)
print( ts_utc )
# 2012-01-01 00:00:00+00:00    0
# 2012-01-01 00:01:00+00:00    1
# 2012-01-01 00:02:00+00:00    2
# 2012-01-01 00:03:00+00:00    3
# 2012-01-01 00:04:00+00:00    4
# 2012-01-01 00:05:00+00:00    5
# 2012-01-01 00:06:00+00:00    6
# 2012-01-01 00:07:00+00:00    7
# 2012-01-01 00:08:00+00:00    8
# 2012-01-01 00:09:00+00:00    9
# Freq: T, dtype: int32

print( ts_utc.tz_convert('US/Eastern') )  # convert between timezones
# 2011-12-31 19:00:00-05:00    0
# 2011-12-31 19:01:00-05:00    1
# 2011-12-31 19:02:00-05:00    2
# 2011-12-31 19:03:00-05:00    3
# 2011-12-31 19:04:00-05:00    4
# 2011-12-31 19:05:00-05:00    5
# 2011-12-31 19:06:00-05:00    6
# 2011-12-31 19:07:00-05:00    7
# 2011-12-31 19:08:00-05:00    8
# 2011-12-31 19:09:00-05:00    9
# Freq: T, dtype: int32

print( ts.to_period() )  # period display: the timestamps only show down to the defined frequency unit
# 2012-01-01 00:00    0
# 2012-01-01 00:01    1
# 2012-01-01 00:02    2
# 2012-01-01 00:03    3
# 2012-01-01 00:04    4
# 2012-01-01 00:05    5
# 2012-01-01 00:06    6
# 2012-01-01 00:07    7
# 2012-01-01 00:08    8
# 2012-01-01 00:09    9
# Freq: T, dtype: int32

print( ts.to_period().to_timestamp() )  # and back to full timestamp display
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32

print('------------------------------------------------------------------------------------')
df = pd.DataFrame({"id":[1,2,3,4,5,6],
                   "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")  # new column with category dtype (a category is a class label)
print( df["grade"] )
# 0    a
# 1    b
# 2    b
# 3    a
# 4    a
# 5    e
# Name: grade, dtype: category

df["grade"].cat.categories = ["very good", "good", "very bad"]  # rename the categories (see the compatibility note at the end of this script)
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])  # redefine the category set, replacing the old one
print( df["grade"] )
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]

print( df.groupby("grade").size() )  # counts per category, including empty ones
# grade
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
# dtype: int64

print('------------------------------------------------------------------------------------')
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))  # 1000 days of random values on a date index
ts = ts.cumsum()  # running total
print(ts)
ts.plot()   # some environments render the figure right here
plt.show()  # others need matplotlib.pyplot imported and an explicit show() call
# The figure is a single curve: x axis = the 1000 days, y axis = the running total per day.

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])  # 4 columns on the same date index
df = df.cumsum()  # running total per column
df.plot()
plt.show()
# The figure is four curves: x axis = the 1000 days, y axis = each column's running total per day.
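# --- Added note: assigning cat.categories in newer pandas ---
# Writing `df["grade"].cat.categories = [...]` directly (as in the categorical
# section above) was deprecated and removed in pandas 2.0; the supported
# spelling is rename_categories. A self-contained sketch of the same rename:
s_grade = pd.Series(['a', 'b', 'b']).astype('category')          # categories: ['a', 'b']
s_grade = s_grade.cat.rename_categories(['very good', 'good'])   # rename in category order
print(s_grade)
# 0    very good
# 1         good
# 2         good
# dtype: category
# Categories (2, object): ['very good', 'good']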