Reading HDFS Files with Python

January 12, 2023, 13:27:54
### Method 1: read HDFS files with the hdfs library
### Pass encoding='utf-8' when reading; otherwise each line comes back as bytes (b'xxx')
### Collect the lines into a list, convert it to a DataFrame, split the single column into named columns, and finally cast the relevant fields to the right dtypes
from hdfs.client import Client
client = Client("http://hadoop-1-1:50070")
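### The URL is the NameNode's WebHDFS endpoint (port 50070 on Hadoop 2.x, 9870 on 3.x).
### A quick sanity check before reading -- a minimal sketch using the client's list()
### method, which returns the file names under a given HDFS directory:
print(client.list("/user/spark/H2O"))  ## confirms the path exists and shows its contents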

lines = []
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    for line in reader:  
        lines.append(line.strip())

column_str = lines[0]
column_list = column_str.split(',')

data = {"item_list":lines[1:]}

import pandas as pd
df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series(x.split(",")))  ## split each raw line into the named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the raw column

df.dtypes
"""
Region              object
Fresh               object
Milk                object
Grocery             object
Frozen              object
Detergents_Paper    object
Delicassen          object
target              object
dtype: object
"""


df = df.astype('int')  ## cast every column from object to int64
df.dtypes
"""
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
target              int64
dtype: object
"""
### Method 2: read HDFS files with the pydoop library
import pydoop.hdfs as hdfs

lines = []
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    for line in f:
        lines.append(line.strip())


column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

data = {"item_list":lines[0:]}

import pandas as pd
df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series(x.split(",")))  ## split each raw line into the named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the raw column

## cast the four numeric fields to float64
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float64')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""
### Read the data directly with pd.read_table
import pydoop.hdfs as hdfs
import pandas as pd

### This file includes a header row
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    df = pd.read_table(f)  ## read_table defaults to sep='\t', so each CSV row lands in a single column


column_list = df.columns[0].split(",")  ## the header line became the lone column's name
df[column_list] = df.iloc[:, 0].apply(lambda x: pd.Series(x.split(",")))  ## note: select positionally with df.iloc[:, 0] -- the column's name is the whole header string

df.head()
"""
Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Species
0	5.1,3.5,1.4,0.2,setosa	5.1	3.5	1.4	0.2	setosa
1	4.9,3,1.4,0.2,setosa	4.9	3	1.4	0.2	setosa
2	4.7,3.2,1.3,0.2,setosa	4.7	3.2	1.3	0.2	setosa
3	4.6,3.1,1.5,0.2,setosa	4.6	3.1	1.5	0.2	setosa
4	5,3.6,1.4,0.2,setosa	5	3.6	1.4	0.2	setosa
"""


df.drop(df.columns[0], axis=1, inplace=True)  ## drop the original combined column
df.dtypes
"""
Sepal_Length    object
Sepal_Width     object
Petal_Length    object
Petal_Width     object
Species         object
dtype: object
"""


##### Cast the four fields 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width' to float
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""

Reposted from: https://my.oschina.net/kyo4321/blog/3016864
