Pandas+groupby用法讲解

2022-07-18 08:37:12
import pandasas pdimport numpyas np
#导入数据
type_specified={"trip_id":"object","bikeid":"object","from_station_id":"object","to_station_id":"object","hour":"object","weekday":"object"}
ridership=pd.read_csv(r"E:\chicago_bikes_data\txt&csv\ridership_2016.txt",
                      nrows=10000,dtype=type_specified,usecols=range(1,15))
ridership.head()
trip_idstarttimestoptimebikeidtripdurationfrom_station_idfrom_station_nameto_station_idto_station_nameusertypegenderbirthyearhourweekday
090805513/31/2016 23:534/1/2016 0:07155841344Ravenswood Ave & Lawrence Ave458Broadway & Thorndale AveSubscriberMale1986.0234
190805503/31/2016 23:463/31/2016 23:574831649128Damen Ave & Chicago Ave213Leavitt St & North AveSubscriberMale1980.0234
290805493/31/2016 23:423/31/2016 23:464232210350Ashland Ave & Chicago Ave210Ashland Ave & Division StSubscriberMale1979.0234
390805483/31/2016 23:373/31/2016 23:5534641045303Broadway & Cornelia Ave458Broadway & Thorndale AveSubscriberMale1980.0234
490805473/31/2016 23:333/31/2016 23:371750202334Lake Shore Dr & Belmont Ave329Lake Shore Dr & Diversey PkwySubscriberMale1969.0234
#groupby可以通过传入需要分组的参数实现对数据的分组
ridership_user=ridership.groupby("usertype")
ridership_user_week=ridership.groupby(["usertype","weekday"])
#groupby之后的数据并不是DataFrame格式的数据,而是特殊的groupby类型,此时,可以通过size()方法返回分组后的记录数目统计结果,该结果#是Series类型
ridership_user.size()
usertype
Customer      1023
Subscriber    8977
dtype: int64
ridership_user_week.size()
usertype    weekday
Customer    3           286
            4           737
Subscriber  3          2714
            4          6263
dtype: int64
#可以通过索引的方式实现对分组后各类别记录数的提取
ridership_user_week.size().loc["Customer","3"]
286
#通过groupby完成对数据的分组后,可以通过get_group方法来获取某一制定分组的结果
ridership_user.get_group("Customer").head()
trip_idstarttimestoptimebikeidtripdurationfrom_station_idfrom_station_nameto_station_idto_station_nameusertypegenderbirthyearhourweekday
2390805263/31/2016 23:043/31/2016 23:3216381648505Winchester Ave & Elston Ave505Winchester Ave & Elston AveCustomerNaNNaN234
6590804813/31/2016 22:243/31/2016 22:4534381220152Lincoln Ave & Diversey Pkwy316Damen Ave & Sunnyside AveCustomerNaNNaN224
7790804633/31/2016 22:153/31/2016 22:1734649747State St & Kinzie St47State St & Kinzie StCustomerNaNNaN224
11390804263/31/2016 22:023/31/2016 22:2927161582340Clark St & Wrightwood Ave161Rush St & Superior StCustomerNaNNaN224
11590804243/31/2016 22:023/31/2016 22:2910521611340Clark St & Wrightwood Ave161Rush St & Superior StCustomerNaNNaN224
#对于多层分组后的数据,如果需要使用多重索引,此处需要传入一个元组
ridership_user_week.get_group(("Customer","3")).head()
trip_idstarttimestoptimebikeidtripdurationfrom_station_idfrom_station_nameto_station_idto_station_nameusertypegenderbirthyearhourweekday
700290712383/30/2016 23:573/31/2016 0:034018317305Western Ave & Division St374Western Ave & Walton StCustomerNaNNaN233
700790712333/30/2016 23:513/30/2016 23:564018316374Western Ave & Walton St305Western Ave & Division StCustomerNaNNaN233
701490712263/30/2016 23:283/30/2016 23:45114299371Morgan St & Lake St15Racine Ave & 18th StCustomerNaNNaN233
711790710313/30/2016 20:133/30/2016 20:3419191302284Michigan Ave & Jackson Blvd173Mies van der Rohe Way & Chicago AveCustomerNaNNaN203
711890710303/30/2016 20:133/30/2016 20:3429091312284Michigan Ave & Jackson Blvd173Mies van der Rohe Way & Chicago AveCustomerNaNNaN203
#对于分组后的数据,如果还需要进行一系列查询,可以使用apply方法,传入一个处理的函数#假设我们需要查找上述分组后每组中用时最长的一组记录的起始和终止站点,可执行如下命令
max_time_from_id=ridership_user_week.apply(lambda record:record["from_station_id"][record["tripduration"].idxmax()])
max_time_to_id=ridership_user_week.apply(lambda record:record["to_station_id"][record["tripduration"].idxmax()])
max_time_time=ridership_user_week.apply(lambda record:record["tripduration"][record["tripduration"].idxmax()])
max_time_ridership=pd.DataFrame({"from_station_id":max_time_from_id,"to_station_id":max_time_to_id,"tripduratio":max_time_time})
max_time_ridership
from_station_idto_station_idtripduratio
usertypeweekday
Customer33527781702
434022968571
Subscriber328725471758
41334970977
  • 作者:zhangxiaojiakele
  • 原文链接:https://blog.csdn.net/zhangxiaojiakele/article/details/78198568
    更新时间:2022-07-18 08:37:12