Files
python/python_13_spark.py
T

48 lines
1.3 KiB
Python
Raw Normal View History

2025-08-05 09:19:34 +08:00
from pyspark import SparkConf, SparkContext
import os
# Point PySpark at the Python interpreter it should use (PYSPARK_PYTHON).
# A raw string keeps the Windows backslashes literal: "\p" and "\c" are invalid
# escape sequences in a normal string literal and trigger a SyntaxWarning on
# Python 3.12+. The resulting value is unchanged.
# NOTE(review): this looks like a machine-specific install path - confirm it
# exists on the machine running the script.
os.environ["PYSPARK_PYTHON"] = r"D:\programtool\conda\python"
# Build the Spark configuration: local master ("local[*]") and a fixed app name.
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("text_spark_app")
# Create the SparkContext from that configuration; everything below goes through it.
sc = SparkContext(conf=conf)
# Uncomment to print the Spark version:
# print(sc.version)
# parallelize() loads an in-memory Python object into Spark as an RDD.
# Three kinds of input are demonstrated: a list, a tuple and a string.
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize((1, 2, 4, 5, 6))
rdd3 = sc.parallelize("342sdf")
# Uncomment to inspect the contents of each RDD:
# print(rdd1.collect())
# print(rdd2.collect())
# print(rdd3.collect())
# An RDD can also be created from a text file:
# rdd4 = sc.textFile("C语言程序设计(上).北京理工大学.csv")
# print(rdd4.collect())
def func(data):
    """Return ``data`` multiplied by 10.

    Used as the per-element transformation passed to ``rdd1.map``.
    """
    return data * 10
def _split_on_space(line):
    """Split one line of text on single space characters."""
    return line.split(" ")


def _add(a, b):
    """Combine two values that share a key by adding them."""
    return a + b


# map(): apply func to every element of rdd1, i.e. multiply each value by 10.
rdd5 = rdd1.map(func)
# print(rdd5.collect())
# map() vs flatMap() on the same input: map() yields one list of words per line,
# whereas flatMap() removes that nesting and yields the individual words.
rdd6 = sc.parallelize(["hello world 232", "welcome learn python 233"])
rdd7 = rdd6.map(_split_on_space)
rdd8 = rdd6.flatMap(_split_on_space)
# print(rdd7.collect())
# print(rdd8.collect())
# reduceByKey(): merge the values of each key pairwise with the given function.
# Every pair below uses the empty string as its key, so all values share one key.
rdd9 = sc.parallelize([("", 88), ("", 68), ("", 48), ("", 38)])
rdd10 = rdd9.reduceByKey(_add)
_totals = rdd10.collect()
print(_totals)
# Shut the SparkContext down now that the job is finished.
sc.stop()