如果 SparkContext 或 SQLContext 随着被序列化的函数(闭包)一起传到了 Worker,哪怕 Worker 中实际上并没有用到它,也会报下面的错误:
Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transforamtion. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
比如以下几种情况都会报错:
class Test(object):
    """Failing example: the SparkContext is stored on the instance.

    The function handed to ``rdd.map`` references ``self``, and ``self``
    holds ``spark_context``, so the context is dragged along with the
    function even though ``process`` never reads it.
    """

    def __init__(self, spark_context):
        """Keep the driver-side SparkContext as an instance attribute."""
        self.spark_context = spark_context

    def process(self, data):
        """Return ``data`` doubled; ``self.spark_context`` is not used."""
        return data * 2

    def run(self, rdd):
        """Map ``process`` over ``rdd`` and return the mapped RDD.

        The mapped RDD is returned (the original snippet discarded it) so
        that a caller can actually use the result of the transformation.
        """
        # self.process does not use self.spark_context, but the lambda
        # still references self, so the error is still reported.
        return rdd.map(lambda x: self.process(x))
class Test(object):
    """Failing example: the SparkContext is stored on the class.

    The function handed to ``rdd.map`` only calls the static method
    ``Test.process``, but it references the class ``Test``, whose class
    attribute ``spark_context`` holds the context.
    """

    # Class-level slot for the context; filled in by __init__.
    spark_context = None

    def __init__(self, spark_context):
        """Store the driver-side SparkContext on the class itself."""
        Test.spark_context = spark_context

    @staticmethod
    def process(data):
        """Return ``data`` doubled; ``Test.spark_context`` is not used."""
        return data * 2

    def run(self, rdd):
        """Map ``Test.process`` over ``rdd`` and return the mapped RDD.

        The mapped RDD is returned (the original snippet discarded it) so
        that a caller can actually use the result of the transformation.
        """
        # Test.process does not use Test.spark_context, but the lambda
        # still references Test, so the error is still reported.
        return rdd.map(lambda x: Test.process(x))
class Test(object):
    """Failing example: context on the class, instance method in the map.

    The function handed to ``rdd.map`` references ``self``; the class of
    ``self`` carries ``spark_context`` as a class attribute.
    """

    # Class-level slot for the context; filled in by __init__.
    spark_context = None

    def __init__(self, spark_context):
        """Store the driver-side SparkContext on the class itself."""
        Test.spark_context = spark_context

    def process(self, data):
        """Return ``data`` doubled; ``Test.spark_context`` is not used."""
        return data * 2

    def run(self, rdd):
        """Map ``process`` over ``rdd`` and return the mapped RDD.

        The mapped RDD is returned (the original snippet discarded it) so
        that a caller can actually use the result of the transformation.
        """
        # self.process does not use Test.spark_context, but the error is
        # still reported.
        return rdd.map(lambda x: self.process(x))
改成下面的方式就可以:spark_context 只保存在实例 self 上,而传给 Worker 的函数只引用类 Test(静态方法),不引用 self:
class Test(object):
    """Working example: context on the instance, static method in the map.

    The SparkContext lives only on ``self``, while the function handed to
    ``rdd.map`` references only the class ``Test``, whose class attribute
    ``spark_context`` stays ``None``.
    """

    # Class-level default; never overwritten, so the class holds no context.
    spark_context = None

    def __init__(self, spark_context):
        """Keep the driver-side SparkContext on this instance only."""
        self.spark_context = spark_context

    @staticmethod
    def process(data):
        """Return ``data`` doubled; needs no instance or context."""
        return data * 2

    def run(self, rdd):
        """Map ``Test.process`` over ``rdd`` and return the mapped RDD.

        The mapped RDD is returned (the original snippet discarded it) so
        that a caller can actually use the result of the transformation.
        """
        # The lambda calls Test.process, while spark_context is kept on
        # self, so the lambda never references the context.
        return rdd.map(lambda x: Test.process(x))