Pandas基础
一、Series
Series讲解
from pandas import Series,DataFrame
import pandas as pd
# Series basics: construction, attributes, selection and vectorised arithmetic.
# numpy is imported here because np.abs is used below, before the file's
# first `import numpy as np`.
import numpy as np

# With no explicit index a default integer index (0..n-1) is created.
obj = Series([1, -2, 3, -4])
type(obj)

# A Series with explicit string labels.
obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj2.values
obj2.index

# Positional access must go through .iloc: a bare integer key on a
# label-indexed Series is deprecated and is treated as a label in newer pandas.
obj2.iloc[2]
# A list of labels returns a Series rather than a scalar.
obj2[['c']]
# Assign through a label.
obj2['c'] = 23
# Boolean filtering and element-wise operations keep the index aligned.
obj2[obj2 < 0]
obj2 * 2
np.abs(obj2)
# Build a Series from a dict: keys become the index, values become the data.
data = {
    '张三': 92,
    '李四': 78,
    '王五': 68,
    '小明': 82,
}
obj3 = Series(data)

# `names` was referenced without ever being defined (NameError). It is defined
# here as the labels to pull out of `data`; a label missing from `data` would
# produce NaN in the result.
# NOTE(review): the original list of names is not visible in this file — confirm.
names = ['张三', '李四', '王五', '小明']
obj4 = Series(data, index=names)

# Both the Series itself and its index can carry a name.
obj4.name = 'math'
obj4.index.name = 'students'
二、DataFrame
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
# Column-oriented input: each key is a column name, each list holds that
# column's values.
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京'],
}

# Default construction, then with an explicit column order, then with an
# explicit column order and row labels; each assignment replaces the previous df.
df = pd.DataFrame(data=data)
df = pd.DataFrame(data=data, columns=['name', 'sex', 'year', 'city'])
df = pd.DataFrame(
    data=data,
    columns=['name', 'sex', 'year', 'city'],
    index=['a', 'b', 'c', 'd'],
)

# Membership tests against the column labels and the row labels.
'sex' in df.columns
'f' in df.index
三、索引值
# reindex() conforms a Series to a new set of labels.
obj = Series([1, -2, 3, -4], index=['b', 'a', 'c', 'd'])

# Labels absent from the original ('e') are filled with NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

# method='ffill' needs a monotonic index whose labels are comparable with the
# new ones. Calling it on `obj` (unordered string labels) against range(6)
# raises ValueError, so forward-fill is shown on an integer-indexed Series.
# NOTE(review): the integer-indexed Series is a stand-in; confirm the intended data.
obj_int = Series([1, -2, 3, -4], index=[0, 2, 3, 5])
obj2 = obj_int.reindex(range(6), method='ffill')
# Conform df to the given row labels; any newly introduced cells get 0.0.
df2 = df.reindex(index=['a', 'b', 'c', 'd'], fill_value=0.0)
# Throw away the current row labels and fall back to a default integer index.
df3 = df2.reset_index(drop=True)
# Promote the 'name' column to be the row index (this rebinds df2).
df2 = df.set_index(keys='name')
四、索引和选取
# Passing a list of labels selects several elements and returns a Series.
obj[['a','c']]
# A label slice includes its end label, unlike a positional slice.
obj['a':'c']
# A list of column names selects a sub-DataFrame with those columns.
df[['city','sex']]
# .loc selects a row by index label (df2 is indexed by the 'name' column).
df2.loc['张三']
# .iloc selects a row by integer position.
df2.iloc[1]
# Boolean masks are combined with &; each comparison needs its own parentheses.
df2[(df2['sex'] == 'female') & (df2['city'] == '北京')]
五、行和列的操作
# Add one record, given as a dict of column -> value, as a new row of df.
new_data = {
    'city': '武汉',
    'name': '小李',
    'sex': 'male',
    'year': 2002,
}
# DataFrame.append was removed in pandas 2.0. The equivalent is to wrap the
# record in a one-row frame and concatenate; columns are aligned by name and
# ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, DataFrame([new_data])], ignore_index=True)
# Drop the row labelled 2; drop() returns a new frame and leaves df unchanged.
new_df = df.drop(2,axis=0)
# NOTE(review): no 'class' column is created anywhere above this line, so this
# drop is expected to raise KeyError — confirm which frame it was written for.
new_df = new_df.drop('class',axis=1)
# Relabel rows 3->2 and 4->3 and rename column 'math' to 'MATH', in place.
# NOTE(review): no 'math' column is visible above either; rename() skips
# labels it cannot find by default, so the column part is likely a no-op here.
new_df.rename(index={3:2,4:3},columns={'math':'MATH'},inplace=True)
# `obj1` was used without being defined (NameError). A small Series with
# unordered labels and values is defined so the sorting calls have data.
# NOTE(review): the original obj1 is not visible in this file — confirm.
obj1 = Series([-2, 3, 2, 1], index=['b', 'a', 'd', 'c'])

# Sort by index label, ascending then descending.
obj1.sort_index()
obj1.sort_index(ascending=False)
# Sort by value, largest first.
obj1.sort_values(ascending=False)
# Sort the rows of df2 by column 'b', largest first.
# NOTE(review): df2 as built earlier in this file has no column 'b', so this
# is expected to raise KeyError — confirm which frame it was written for.
df2.sort_values(by='b',ascending=False)
# Summary statistics of df's columns.
df.describe()
# Distinct values of obj.
obj.unique()
# Number of occurrences of each distinct value of obj.
obj.value_counts()
# A Series with a two-level (hierarchical) index built from two parallel
# label lists: the outer level groups rows, the inner level names them.
obj = pd.Series(
    data=np.random.randn(9),
    index=[
        ['one', 'one', 'one', 'two', 'two', 'two', 'three', 'three', 'three'],
        ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    ],
)
# Select on the inner level: every outer group's 'a' entry.
obj[:, 'a']
# A DataFrame with hierarchical labels on both the rows and the columns.
df = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['one', 'one', 'two', 'two'], ['a', 'b', 'a', 'b']],
    columns=[['apple', 'apple', 'orange', 'orange'],
             ['red', 'green', 'red', 'green']],
)
# Swap the outer and inner row levels.
df.swaplevel(0, 1)

# The `level=` argument of sum() was removed in pandas 2.0; grouping by the
# level is the replacement. sort=False keeps groups in order of appearance,
# as the old `level=` reductions did.
df.groupby(level=0, sort=False).sum()
# Column-wise: transpose so the column level becomes a row level, group and
# sum, then transpose back.
df.T.groupby(level=1, sort=False).sum().T
六、pandas数据可视化
1.使用Series绘制线型图
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Ten draws from the standard normal distribution, labelled 'a'..'j'.
s = pd.Series(
    data=np.random.normal(size=10),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
)
# Line chart (the default plot kind) with the index on the x axis.
s.plot(kind='line')
plt.show()
2.使用DataFrame绘制线型图
# Three columns of 100 random draws each: normal, gamma (shape 1) and Poisson.
df = DataFrame({'normal': np.random.normal(size=100),
'gamma': np.random.gamma(1, size=100),
'poisson': np.random.poisson(size=100)})
# Summary statistics per column.
df.describe()
# One line per column on shared axes.
df.plot()
plt.show()
# NOTE(review): the frame built above has no 'sex' column, so the next two
# lines are expected to raise KeyError — presumably they were written for the
# name/sex/year/city frame; confirm.
df['sex'].value_counts()
# Horizontal bar chart of the value counts.
df['sex'].value_counts().plot(kind='barh')
3.使用series绘制柱状图
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Two stacked axes: vertical bars on top, horizontal bars below.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Note: despite its name, `df` is a Series here (16 random values in [0, 1)).
df = pd.Series(data=np.random.rand(16), index=list('abcdefgijkpolikj'))
df.plot(kind='bar', ax=axes[0], color='r', alpha=0.7)
df.plot(kind='barh', ax=axes[1], color='r', alpha=0.7)
plt.show()
4.使用DataFrame绘制柱状图
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# A 4x4 frame of random values; the columns Index is named 'bar'.
df = pd.DataFrame(
    data=np.random.rand(4, 4),
    index=['one', 'two', 'three', 'four'],
    columns=pd.Index(['A', 'B', 'C', 'D'], name='bar'),
)
# Grouped bar chart: one group per row, one bar per column.
df.plot(kind='bar')
plt.show()
5.使用DataFrame的hist方法生成直方图
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
# Two columns of 1000 standard-normal draws each.
df = pd.DataFrame(
    data={
        'a': np.random.randn(1000),
        'b': np.random.randn(1000),
    },
    columns=['a', 'b'],
)
# Histogram of the columns using 20 bins.
df.plot(kind='hist', bins=20)
plt.show()
七、pandas文件操作
# Read a semicolon-separated file.
data = pd.read_csv('05_Regression_5.2_logreg_credit_scores.csv', delimiter=';')
# Read a comma-separated file that has no header row (this rebinds `data`).
data = pd.read_csv('iris.data', sep=',', header=None)
# First six rows.
data.head(n=6)
# Write the frame back out as CSV.
data.to_csv(path_or_buf='iris.csv')
八、pandas数据清洗与整理
1.数据清洗
# A small frame mixing numbers, strings and missing values (np.nan).
df1 = pd.DataFrame(data=[
    [3, 5, 3],
    [1, 6, np.nan],
    ['lili', np.nan, 'pop'],
    [np.nan, 'a', 'b'],
])

# Element-wise missing / not-missing masks.
df1.isna()
df1.notna()
# Missing count per column, then the grand total.
df1.isna().sum()
df1.isna().sum().sum()
# Does each column contain a missing value? Does the frame contain any at all?
df1.isna().any()
df1.isna().values.any()
# Per-column non-null counts and dtypes (printed).
df1.info()
# Rows containing any missing value are removed in the returned copy.
df1.dropna()
# Dropping and filling missing values.
# The df2 defined earlier in this file is indexed by name and has no integer
# column labels, yet the statements below address row 2 and columns 0..3.
# A 3x4 numeric frame is built here so the cell runs on its own; float dtype
# lets NaN be assigned without a dtype change.
# NOTE(review): the original df2 for this cell is not visible — confirm.
df2 = DataFrame(np.arange(12, dtype=float).reshape(3, 4))

# .ix was removed from pandas; .loc is the label-based replacement.
df2.loc[2, :] = np.nan
df2[3] = np.nan

# Drop only rows (then only columns) that are entirely missing.
df2.dropna(how='all')
df2.dropna(how='all', axis=1)

# Fill with one constant, or with a per-column constant given as a dict.
df2.fillna(0)
df2.fillna({0: 1, 1: 6, 2: 9, 3: 11})
# inplace=True modifies df2 itself; only columns 1 and 3 are filled.
df2.fillna({1: 6, 3: 0}, inplace=True)
# fillna(method='ffill') is deprecated; ffill() is the direct equivalent.
df2.ffill()
# Fill column 0 with its own mean (NaN is skipped when computing the mean).
df2[0] = df2[0].fillna(df2[0].mean())
2.重复数据
# Detecting and removing duplicate rows. Rows 0 and 2 are identical.
data = {
    'name': ['张三', '李四', '张三', '小明'],
    'sex': ['female', 'male', 'female', 'male'],
    'year': [2001, 2002, 2001, 2002],
    'city': ['北京', '上海', '北京', '北京'],
}
df1 = DataFrame(data)

# True for every row that repeats an earlier row.
df1.duplicated()
# Remove full-row duplicates from df1 itself, keeping the first occurrence.
df1.drop_duplicates(inplace=True)
# Judge duplicates on a subset of columns; by default the first row is kept.
df1.drop_duplicates(['sex', 'year'])
# keep='last' keeps the last row of each duplicate group instead.
# (A stray '、' after this call was a syntax error and has been removed.)
df1.drop_duplicates(['sex', 'year'], keep='last')
3.替换值
# Records in which empty strings stand in for unknown values.
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'male', '', 'male'],
    'year': [2001, 2003, 2001, 2002],
    'city': ['北京', '上海', '', '北京'],
}
df1 = pd.DataFrame(data=data)

# Two spellings of the same substitution: parallel old/new lists, or a single
# old -> new mapping. Both return a new frame; df1 itself is not modified.
df1.replace(to_replace=['', 2001], value=['不详', 2002])
df1.replace(to_replace={'': '不详', 2001: 2002})
def f(x):
    """Map a numeric score to its grade label.

    Bands: >= 90 -> '优秀', [70, 90) -> '良好', [60, 70) -> '合格',
    anything else -> '不合格'.

    Args:
        x: the score; any value comparable with numbers.

    Returns:
        The grade label as a string.
    """
    # The body had lost its indentation, which made the definition an
    # IndentationError; the branch conditions themselves are unchanged.
    if x >= 90:
        return '优秀'
    elif 70 <= x < 90:
        return '良好'
    elif 60 <= x < 70:
        return '合格'
    else:
        return '不合格'
# Derive a 'class' column by applying f to every value of the 'math' column.
# NOTE(review): no 'math' column is created on df2 anywhere visible in this
# file, so this is expected to raise KeyError — confirm the intended frame.
df2['class'] = df2['math'].map(f)
# Remove the column again so the same result can be rebuilt with apply().
del df2['class']
# Series.apply with a plain function is used here the same way as map above.
df2['class'] = df2['math'].apply(f)
4.虚拟变量
# A categorical column ('朝向') alongside a numeric one ('价格').
df = pd.DataFrame(
    data={
        '朝向': ['东', '南', '东', '西', '北'],
        '价格': [1200, 2100, 2300, 2900, 1400],
    },
)
# One indicator column per distinct category value.
pd.get_dummies(data=df['朝向'])