"""PySpark RDD basics demo: parallelize, map, flatMap and reduceByKey.

Runs against a local Spark master and prints the per-key sums produced by
``reduceByKey``. The commented-out ``print`` calls can be enabled to inspect
the intermediate RDDs.
"""
import os

from pyspark import SparkConf, SparkContext

# Tell PySpark which Python executable to use. This must be set before the
# SparkContext is created. A raw string is used so the backslashes of the
# Windows path are never interpreted as escape sequences.
os.environ["PYSPARK_PYTHON"] = r"D:\programtool\conda\python"


def func(data):
    """Return ``data`` multiplied by 10 (used as the ``map`` callback)."""
    return data * 10


# Create the SparkConf object: local master using all cores, named app.
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")

# Create the SparkContext from the SparkConf object.
sc = SparkContext(conf=conf)

try:
    # Print the Spark version.
    # print(sc.version)

    # Load Python objects into Spark as RDDs via parallelize().
    rdd1 = sc.parallelize([1, 2, 3, 4, 5])
    rdd2 = sc.parallelize((1, 2, 4, 5, 6))
    # A string is an iterable too, so it is distributed element by element.
    rdd3 = sc.parallelize("342sdf")
    # print(rdd1.collect())
    # print(rdd2.collect())
    # print(rdd3.collect())

    # Loading an RDD from a text file instead:
    # rdd4 = sc.textFile("C语言程序设计(上).北京理工大学.csv")
    # print(rdd4.collect())

    # map(): multiply every element of rdd1 by 10.
    rdd5 = rdd1.map(func)
    # print(rdd5.collect())

    # flatMap(): like map(), but removes one level of nesting from the result.
    rdd6 = sc.parallelize(["hello world 232", 'welcome learn python 233'])
    rdd7 = rdd6.map(lambda x: x.split(" "))      # one list of words per line
    rdd8 = rdd6.flatMap(lambda x: x.split(" "))  # one flat sequence of words
    # print(rdd7.collect())
    # print(rdd8.collect())

    # reduceByKey(): combine the values of each key pairwise.
    # For this data the totals are 男 -> 136 and 女 -> 106.
    rdd9 = sc.parallelize([('男', 88), ("女", 68), ('男', 48), ('女', 38)])
    rdd10 = rdd9.reduceByKey(lambda a, b: a + b)
    print(rdd10.collect())
finally:
    # Always stop the SparkContext, even if one of the steps above failed,
    # so the local Spark resources are released.
    sc.stop()